[PATCH][x86] Teach how to fold target-specific dags for packed shifts by immediate count into build_vector.
Andrea Di Biagio
andrea.dibiagio at gmail.com
Tue Dec 24 04:15:10 PST 2013
Ping.
On Mon, Dec 16, 2013 at 9:38 PM, Andrea Di Biagio
<andrea.dibiagio at gmail.com> wrote:
> Hi all,
>
> This patch teaches the x86 backend how to lower packed shifts by
> immediate count into a build_vector when the input to the shift is a
> vector of ConstantSDNode elements.
>
> I spotted this poor code generation bug while looking at how
> 'sign_extend_inreg' nodes are lowered in X86ISelLowering.cpp.
>
> When lowering SIGN_EXTEND_INREG dag nodes, we may end up expanding the
> 'sign_extend_inreg' dag into a sequence of target-specific dag nodes
> that perform a packed logical vector shift left by immediate followed
> by a packed arithmetic shift right by immediate (i.e. X86ISD::VSHLI +
> X86ISD::VSRAI).
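>
> As a rough sketch (the names Tmp and SExt are just for illustration;
> for a v4i1 mask sign-extended in a v4i32, the element width is 32 so
> the shift count is 31), the expansion looks like:
>
> v4i32 Tmp = X86ISD::VSHLI Mask, 31
> v4i32 SExt = X86ISD::VSRAI Tmp, 31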
>
> In some cases, the sequence of shifts can be folded into a simple
> BUILD_VECTOR node if we know that the input to the VSHLI is a vector
> of all-constant elements (ConstantSDNode).
>
> For example, given the following IR:
>
> /////////////////
> define <4 x float> @foo(<4 x float> %a, <4 x float> %b) {
> %1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>,
> <4 x float> %a, <4 x float> %b
> ret <4 x float> %1
> }
> /////////////////
>
> The initial Selection DAG would contain the following sequence:
>
> i1 C = Constant<0>
> v4i1 Mask = BUILD_VECTOR C, C, C, C
> v4f32 Result = vselect Mask, %a, %b
>
>
> Before legalization, the BUILD_VECTOR that computes Mask is 'illegal'
> because of its value type. During the 'type legalization' stage, the
> BUILD_VECTOR is then replaced by a new (legally typed) BUILD_VECTOR
> followed by a SIGN_EXTEND_INREG.
>
> i32 C = Constant<0>
> v4i32 Mask = BUILD_VECTOR C, C, C, C
> v4i32 SExtMask = SIGN_EXTEND_INREG Mask, ValueType:v4i1
> v4f32 Result = vselect SExtMask, %a, %b
>
>
> Before this patch (using `llc -mattr=-sse4.1`), the SIGN_EXTEND_INREG
> was lowered into a combination of a packed logical shift left and a
> packed arithmetic shift right.
>
> With this patch, the two shift instructions produced by the
> 'lowerSIGN_EXTEND_INREG' function are correctly folded into a single
> BUILD_VECTOR when the Mask is a vector of Constants.
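>
> In the example above, the folded DAG is then simply (a sketch; the
> false mask elements sign-extend to zero):
>
> v4i32 SExtMask = BUILD_VECTOR 0, 0, 0, 0
> v4f32 Result = vselect SExtMask, %a, %b
>
> and the all-zero mask trivially selects %b.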
>
> Without this patch, the example above produced:
> pxor %xmm2, %xmm2
> pslld $31, %xmm2
> psrad $31, %xmm2
> pand %xmm2, %xmm0
> pandn %xmm1, %xmm2
> por %xmm0, %xmm2
> movdqa %xmm2, %xmm0
> ret
>
>
> With this patch, the compiler now produces:
> movaps %xmm1, %xmm0
>
> (More test cases have been added to llvm/test/CodeGen/X86/vselect.ll
> by this patch.)
>
> Since this patch touches function 'getTargetVShiftByConstNode', it
> also improves the code generated for the x86 builtin intrinsics for
> packed shifts by immediate count (e.g. @llvm.x86.sse2.pslli.w) when
> the input vector is a vector of constants.
>
> See for example test cases in file llvm/test/CodeGen/X86/vec_shift5.ll.
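>
> For instance, mirroring one of those test cases, a call such as
>
> %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 1, i32 2, i32 4, i32 8>, i32 3)
>
> is now folded to the constant vector <i32 8, i32 16, i32 32, i32 64>
> and emitted as a single load from the constant pool rather than a
> pslld by immediate count.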
>
> Please let me know if this is ok to submit.
>
> Thanks,
> Andrea Di Biagio
> SN Systems - Sony Computer Entertainment Group
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp (revision 197977)
+++ lib/Target/X86/X86ISelLowering.cpp (working copy)
@@ -11094,16 +11094,29 @@
MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
+// Return true if this is a BUILD_VECTOR of ConstantSDNodes.
+// This is a helper function used by `getTargetVShiftByConstNode`.
+static bool isBuildVectorOfConstantSDNodes(SDValue N) {
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ if (!isa<ConstantSDNode>(N->getOperand(i)))
+ return false;
+ return true;
+}
+
// getTargetVShiftByConstNode - Handle vector element shifts where the shift
// amount is a constant. Takes immediate version of shift as input.
static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, EVT VT,
SDValue SrcOp, uint64_t ShiftAmt,
SelectionDAG &DAG) {
+ EVT ElementType = VT.getVectorElementType();
// Check for ShiftAmt >= element width
- if (ShiftAmt >= VT.getVectorElementType().getSizeInBits()) {
+ if (ShiftAmt >= ElementType.getSizeInBits()) {
if (Opc == X86ISD::VSRAI)
- ShiftAmt = VT.getVectorElementType().getSizeInBits() - 1;
+ ShiftAmt = ElementType.getSizeInBits() - 1;
else
return DAG.getConstant(0, VT);
}
@@ -11111,6 +11124,42 @@
assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
&& "Unknown target vector shift-by-constant node");
+ // Try to fold this shift into a build vector.
+ // If SrcOp is a vector of all ConstantSDNode elements then we can statically
+ // compute the resulting vector and propagate the result.
+ if (isBuildVectorOfConstantSDNodes(SrcOp)) {
+ SmallVector<SDValue, 8> Elts;
+ unsigned NumElts = SrcOp->getNumOperands();
+ ConstantSDNode *CurrentND;
+
+ switch(Opc) {
+ default: llvm_unreachable(0);
+ case X86ISD::VSHLI:
+ for (unsigned i=0; i!=NumElts; ++i) {
+ CurrentND = cast<ConstantSDNode>(SrcOp->getOperand(i));
+ const APInt &C = CurrentND->getAPIntValue();
+ Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
+ }
+ break;
+ case X86ISD::VSRLI:
+ for (unsigned i=0; i!=NumElts; ++i) {
+ CurrentND = cast<ConstantSDNode>(SrcOp->getOperand(i));
+ const APInt &C = CurrentND->getAPIntValue();
+ Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
+ }
+ break;
+ case X86ISD::VSRAI:
+ for (unsigned i=0; i!=NumElts; ++i) {
+ CurrentND = cast<ConstantSDNode>(SrcOp->getOperand(i));
+ const APInt &C = CurrentND->getAPIntValue();
+ Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
+ }
+ break;
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElts);
+ }
+
return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
}
Index: test/CodeGen/X86/vselect.ll
===================================================================
--- test/CodeGen/X86/vselect.ll (revision 0)
+++ test/CodeGen/X86/vselect.ll (revision 0)
@@ -0,0 +1,105 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 -mattr=-sse4.1 < %s | FileCheck %s
+
+; Verify that we don't emit packed vector shift instructions if the
+; condition used by the vector select is a vector of constants.
+
+
+define <4 x float> @test1(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test1
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <4 x float> @test2(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test2
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <4 x float> @test3(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <4 x float> @test4(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: movaps %xmm1, %xmm0
+; CHECK: ret
+
+
+define <4 x float> @test5(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test5
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <8 x i16> @test6(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i16> %a, <8 x i16> %a
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test6
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <8 x i16> @test7(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test7
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <8 x i16> @test8(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test8
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+define <8 x i16> @test9(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test9
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
+
+define <8 x i16> @test10(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test10
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
Index: test/CodeGen/X86/blend-msb.ll
===================================================================
--- test/CodeGen/X86/blend-msb.ll (revision 197977)
+++ test/CodeGen/X86/blend-msb.ll (working copy)
@@ -29,9 +29,9 @@
; blendvb instruction or a sequence of NAND/OR/AND. Make sure that we do not
; reduce the mask in this case.
;CHECK-LABEL: vsel_8xi16:
-;CHECK: psllw
-;CHECK: psraw
-;CHECK: pblendvb
+;CHECK: andps
+;CHECK: andps
+;CHECK: orps
;CHECK: ret
define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
Index: test/CodeGen/X86/sse2-blend.ll
===================================================================
--- test/CodeGen/X86/sse2-blend.ll (revision 197977)
+++ test/CodeGen/X86/sse2-blend.ll (working copy)
@@ -1,9 +1,9 @@
; RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse4.1 | FileCheck %s
; CHECK: vsel_float
-; CHECK: pandn
-; CHECK: pand
-; CHECK: por
+; CHECK: xorps
+; CHECK: movss
+; CHECK: orps
; CHECK: ret
define void @vsel_float(<4 x float>* %v1, <4 x float>* %v2) {
%A = load <4 x float>* %v1
@@ -14,9 +14,9 @@
}
; CHECK: vsel_i32
-; CHECK: pandn
-; CHECK: pand
-; CHECK: por
+; CHECK: xorps
+; CHECK: movss
+; CHECK: orps
; CHECK: ret
define void @vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) {
%A = load <4 x i32>* %v1
Index: test/CodeGen/X86/2011-12-28-vselecti8.ll
===================================================================
--- test/CodeGen/X86/2011-12-28-vselecti8.ll (revision 197977)
+++ test/CodeGen/X86/2011-12-28-vselecti8.ll (working copy)
@@ -3,10 +3,24 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin11.2.0"
-; CHECK: @foo8
-; CHECK: psll
-; CHECK: psraw
-; CHECK: pblendvb
+; The backend should be able to fold the vector select
+; into a load of a vector from constant pool since all its operands
+; are vectors of integer constants.
+;
+; During legalization, the vselect mask is 'type legalized' into a
+; wider BUILD_VECTOR. This causes the introduction of a new
+; sign_extend_inreg in the DAG.
+;
+; The lowering of sign_extend_inreg eventually generates the sequence
+; psll+psraw. However, a constant shift of a vector of ConstantSDNode
+; elements can always be folded into a simple build_vector.
+;
+; Make sure that we don't generate psll, psraw and pblendvb from the vselect.
+
+; CHECK-LABEL: foo8
+; CHECK-NOT: psll
+; CHECK-NOT: psraw
+; CHECK-NOT: pblendvb
; CHECK: ret
define void @foo8(float* nocapture %RET) nounwind {
allocas:
Index: test/CodeGen/X86/vec_shift5.ll
===================================================================
--- test/CodeGen/X86/vec_shift5.ll (revision 0)
+++ test/CodeGen/X86/vec_shift5.ll (revision 0)
@@ -0,0 +1,90 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 -mattr=-sse4.1 < %s | FileCheck %s
+
+; Verify that we don't emit packed vector shift instructions if the
+; result is known at compile time.
+; All the packed vector shift by immediate count instructions below should
+; be translated into loads from constant pool of the expected result.
+
+define <8 x i16> @test1() {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> <i16 1, i16 2, i16 4, i16 8, i16 1, i16 2, i16 4, i16 8>, i32 3)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test1
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test2() {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> <i16 4, i16 8, i16 16, i16 32, i16 4, i16 8, i16 16, i16 32>, i32 3)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test2
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test3() {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 4, i16 8, i16 16, i16 32, i16 4, i16 8, i16 16, i16 32>, i32 3)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: psra
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test4() {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 1, i32 2, i32 4, i32 8>, i32 3)
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test5() {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> <i32 4, i32 8, i32 16, i32 32>, i32 3)
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test5
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test6() {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> <i32 4, i32 8, i32 16, i32 32>, i32 3)
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test6
+; CHECK-NOT: psra
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <2 x i64> @test7() {
+ %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> <i64 1, i64 2>, i32 3)
+ ret <2 x i64> %1
+}
+; CHECK-LABEL: test7
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <2 x i64> @test8() {
+ %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> <i64 8, i64 16>, i32 3)
+ ret <2 x i64> %1
+}
+; CHECK-LABEL: test8
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+
+declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32)
+declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32)
+declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32)
+declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32)
+declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32)
+declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32)
+declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32)
+declare <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64>, i32)
+