[llvm] r208271 - Lower certain build_vectors to insertps instructions
Filipe Cabecinhas
me at filcab.net
Wed May 7 17:25:17 PDT 2014
Author: filcab
Date: Wed May 7 19:25:16 2014
New Revision: 208271
URL: http://llvm.org/viewvc/llvm-project?rev=208271&view=rev
Log:
Lower certain build_vectors to insertps instructions
Summary:
Vectors built with zeros and elements in the same order as another
(source) vector are optimized to be built using a single insertps
instruction.
Also optimize when we move one element in a vector to a different place
in that vector while zeroing out some of the other elements.
Further optimizations are possible, described in TODO comments.
I will be implementing at least some of them in the near future.
Added some tests for different cases where this optimization triggers.
Reviewers: nadav, delena, craig.topper
Subscribers: llvm-commits
Differential Revision: http://reviews.llvm.org/D3521
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/sse41.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=208271&r1=208270&r2=208271&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed May 7 19:25:16 2014
@@ -5437,6 +5437,74 @@ static SDValue LowerBuildVectorv8i16(SDV
return V;
}
+/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
+static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
+ unsigned NonZeros, unsigned NumNonZero,
+ unsigned NumZero, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget,
+ const TargetLowering &TLI) {
+ // We know there's at least one non-zero element
+ unsigned FirstNonZeroIdx = 0;
+ SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx);
+ while (FirstNonZero.getOpcode() == ISD::UNDEF ||
+ X86::isZeroNode(FirstNonZero)) {
+ ++FirstNonZeroIdx;
+ FirstNonZero = Op->getOperand(FirstNonZeroIdx);
+ }
+
+ if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(FirstNonZero.getOperand(1)))
+ return SDValue();
+
+ SDValue V = FirstNonZero.getOperand(0);
+ unsigned FirstNonZeroDst = cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
+ unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
+ unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
+ unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
+
+ for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
+ SDValue Elem = Op.getOperand(Idx);
+ if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
+ continue;
+
+ // TODO: What else can be here? Deal with it.
+ if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // TODO: Some optimizations are still possible here
+ // ex: Getting one element from a vector, and the rest from another.
+ if (Elem.getOperand(0) != V)
+ return SDValue();
+
+ unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue();
+ if (Dst == Idx)
+ ++CorrectIdx;
+ else if (IncorrectIdx == -1U) {
+ IncorrectIdx = Idx;
+ IncorrectDst = Dst;
+ } else
+ // There was already one element with an incorrect index.
+ // We can't optimize this case to an insertps.
+ return SDValue();
+ }
+
+ if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
+ SDLoc dl(Op);
+ EVT VT = Op.getSimpleValueType();
+ unsigned ElementMoveMask = 0;
+ if (IncorrectIdx == -1U)
+ ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
+ else
+ ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
+
+ SDValue InsertpsMask = DAG.getIntPtrConstant(
+ ElementMoveMask | (~NonZeros & 0xf));
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
+ }
+
+ return SDValue();
+}
+
/// getVShift - Return a vector logical shift node.
///
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
@@ -6187,6 +6255,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDV
if (V.getNode()) return V;
}
+ // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
+ if (EVTBits == 32 && NumElems == 4) {
+ SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
+ NumZero, DAG, Subtarget, *this);
+ if (V.getNode())
+ return V;
+ }
+
// If element VT is == 32 bits, turn it into a number of shuffles.
SmallVector<SDValue, 8> V(NumElems);
if (NumElems == 4 && NumZero > 0) {
Modified: llvm/trunk/test/CodeGen/X86/sse41.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse41.ll?rev=208271&r1=208270&r2=208271&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse41.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse41.ll Wed May 7 19:25:16 2014
@@ -320,3 +320,259 @@ define <4 x i32> @insertps_from_load_ins
%result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
ret <4 x i32> %result
}
+
+;;;;;; Shuffles optimizable with a single insertps instruction
+define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XYZ0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $8
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecext3 = extractelement <4 x float> %x, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
+ %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
+ ret <4 x float> %vecinit5
+}
+
+define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XY00:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $12
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XYY0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $104
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
+ %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
+ ret <4 x float> %vecinit5
+}
+
+define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XYW0:
+; CHECK: insertps $232
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecext2 = extractelement <4 x float> %x, i32 3
+ %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_W00W:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $198
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 3
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+ %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_X00A:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
+ %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_X00X:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
+ %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_X0YC:
+; CHECK: shufps
+; CHECK-NOT: movhlps
+; CHECK-NOT: shufps
+; CHECK: insertps $176
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+ %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
+ %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+ ret <4 x float> %vecinit5
+}
+
+define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XYZ0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $8
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecext1 = extractelement <4 x i32> %x, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+ %vecext3 = extractelement <4 x i32> %x, i32 2
+ %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
+ %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
+ ret <4 x i32> %vecinit5
+}
+
+define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XY00:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $12
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecext1 = extractelement <4 x i32> %x, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+ %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
+ %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XYY0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $104
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecext1 = extractelement <4 x i32> %x, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+ %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
+ %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
+ ret <4 x i32> %vecinit5
+}
+
+define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XYW0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $232
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecext1 = extractelement <4 x i32> %x, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+ %vecext2 = extractelement <4 x i32> %x, i32 3
+ %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
+ %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_W00W:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $198
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 3
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+ %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
+ %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_X00A:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
+ %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_X00X:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
+ %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_X0YC:
+; CHECK: shufps
+; CHECK-NOT: movhlps
+; CHECK-NOT: shufps
+; CHECK: insertps $176
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+ %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
+ %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+ ret <4 x i32> %vecinit5
+}
+
+;; Test for a bug in the first implementation of LowerBuildVectorv4x32
+define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
+; CHECK-LABEL: test_insertps_no_undef:
+; CHECK: movaps %xmm0, %xmm1
+; CHECK-NEXT: insertps $8, %xmm1, %xmm1
+; CHECK-NEXT: maxps %xmm1, %xmm0
+; CHECK-NEXT: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecext3 = extractelement <4 x float> %x, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
+ %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
+ %mask = fcmp olt <4 x float> %vecinit5, %x
+ %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
+ ret <4 x float> %res
+}
More information about the llvm-commits
mailing list