[llvm] r274990 - [X86][SSE] Add support for target shuffle combining to INSERTPS
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sat Jul 9 14:47:56 PDT 2016
Author: rksimon
Date: Sat Jul 9 16:47:55 2016
New Revision: 274990
URL: http://llvm.org/viewvc/llvm-project?rev=274990&view=rev
Log:
[X86][SSE] Add support for target shuffle combining to INSERTPS
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/insertps-combine.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=274990&r1=274989&r2=274990&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sat Jul 9 16:47:55 2016
@@ -8677,16 +8677,14 @@ static SDValue lowerVectorShuffleAsBroad
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
-static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
+ unsigned &InsertPSMask,
+ const SmallBitVector &Zeroable,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
-
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
-
unsigned ZMask = 0;
int V1DstIndex = -1;
int V2DstIndex = -1;
@@ -8707,7 +8705,7 @@ static SDValue lowerVectorShuffleAsInser
// We can only insert a single non-zeroable element.
if (V1DstIndex >= 0 || V2DstIndex >= 0)
- return SDValue();
+ return false;
if (Mask[i] < 4) {
// V1 input out of place for insertion.
@@ -8720,7 +8718,7 @@ static SDValue lowerVectorShuffleAsInser
// Don't bother if we have no (non-zeroable) element for insertion.
if (V1DstIndex < 0 && V2DstIndex < 0)
- return SDValue();
+ return false;
// Determine element insertion src/dst indices. The src index is from the
// start of the inserted vector, not the start of the concatenated vector.
@@ -8740,8 +8738,21 @@ static SDValue lowerVectorShuffleAsInser
if (!V1UsedInPlace)
V1 = DAG.getUNDEF(MVT::v4f32);
- unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
+ // Insert the V2 element into the desired position.
+ InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+ return true;
+}
+
+static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ // Attempt to match the insertps pattern.
+ unsigned InsertPSMask;
+ if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
+ return SDValue();
// Insert the V2 element into the desired position.
return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
@@ -25077,6 +25088,33 @@ static bool combineX86ShuffleChain(SDVal
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
+ return true;
+ }
+ }
+
+ // Attempt to combine to INSERTPS.
+ if (Subtarget.hasSSE41() && NumMaskElts == 4 &&
+ (VT == MVT::v2f64 || VT == MVT::v4f32)) {
+ SmallBitVector Zeroable(4, false);
+ for (unsigned i = 0; i != NumMaskElts; ++i)
+ if (Mask[i] < 0)
+ Zeroable[i] = true;
+
+ unsigned InsertPSMask;
+ SDValue V1 = Input, V2 = Input;
+ if (Zeroable.any() && matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask,
+ Zeroable, Mask, DAG)) {
+ if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTPS)
+ return false; // Nothing to do!
+ V1 = DAG.getBitcast(MVT::v4f32, V1);
+ DCI.AddToWorklist(V1.getNode());
+ V2 = DAG.getBitcast(MVT::v4f32, V2);
+ DCI.AddToWorklist(V2.getNode());
+ Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
return true;
}
}
Modified: llvm/trunk/test/CodeGen/X86/insertps-combine.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/insertps-combine.ll?rev=274990&r1=274989&r2=274990&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/insertps-combine.ll (original)
+++ llvm/trunk/test/CodeGen/X86/insertps-combine.ll Sat Jul 9 16:47:55 2016
@@ -60,17 +60,12 @@ define <4 x float> @shuffle_v4f32_0z24(<
define <4 x float> @shuffle_v4f32_0zz0(float %a) {
; SSE-LABEL: shuffle_v4f32_0zz0:
; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,0]
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0zz0:
; AVX: # BB#0:
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; AVX-NEXT: retq
%vecinit = insertelement <4 x float> undef, float %a, i32 0
%vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll?rev=274990&r1=274989&r2=274990&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll Sat Jul 9 16:47:55 2016
@@ -190,7 +190,7 @@ define <8 x float> @combine_vpermilvar_8
define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_4f32_as_insertps:
; ALL: # BB#0:
-; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[1],zero,xmm0[2],zero
; ALL-NEXT: retq
%1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
%2 = shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 1, i32 4>
More information about the llvm-commits mailing list