[PATCH][x86] Teach how to fold target-specific dags for packed shifts by immediate count into build_vector.
Andrea Di Biagio
andrea.dibiagio at gmail.com
Tue Dec 24 04:15:10 PST 2013
Ping.
On Mon, Dec 16, 2013 at 9:38 PM, Andrea Di Biagio
<andrea.dibiagio at gmail.com> wrote:
> Hi all,
>
> This patch teaches the x86 backend how to lower packed shifts by
> immediate count into a build_vector when the input to the shift is a
> vector of ConstantSDNode elements.
>
> I spotted this poor code generation bug while looking at how
> 'sign_extend_inreg' nodes are lowered in X86ISelLowering.cpp.
>
> When lowering SIGN_EXTEND_INREG dag nodes, we may end up expanding the
> 'sign_extend_inreg' dag into a sequence of target-specific dag nodes
> that perform a packed logical vector shift left by immediate followed
> by a packed arithmetic shift right by immediate (i.e. X86ISD::VSHLI +
> X86ISD::VSRAI).
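>
> As a rough sketch (the names Tmp and SExt are just for illustration;
> for a v4i1 mask sign-extended in a v4i32, the element width is 32 so
> the shift count is 31), the expansion looks like:
>
> v4i32 Tmp = X86ISD::VSHLI Mask, 31
> v4i32 SExt = X86ISD::VSRAI Tmp, 31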
>
> In some cases, the sequence of shifts can be folded into a simple
> BUILD_VECTOR node if we know that the input to the VSHLI is a vector
> of all-constant elements (ConstantSDNode).
>
> For example, given the following IR:
>
> /////////////////
> define <4 x float> @foo(<4 x float> %a, <4 x float> %b) {
> %1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>,
> <4 x float> %a, <4 x float> %b
> ret <4 x float> %1
> }
> /////////////////
>
> The initial Selection DAG would contain the following sequence:
>
> i1 C = Constant<0>
> v4i1 Mask = BUILD_VECTOR C, C, C, C
> v4f32 Result = vselect Mask, %a, %b
>
>
> Before legalization, the BUILD_VECTOR that computes Mask is 'illegal'
> because of its value type. During the 'type legalization' stage, the
> BUILD_VECTOR is then replaced by a new (legally typed) BUILD_VECTOR
> followed by a SIGN_EXTEND_INREG.
>
> i32 C = Constant<0>
> v4i32 Mask = BUILD_VECTOR C, C, C, C
> v4i32 SExtMask = SIGN_EXTEND_INREG Mask, ValueType:v4i1
> v4f32 Result = vselect SExtMask, %a, %b
>
>
> Before this patch (using `llc -mattr=-sse4.1`), the SIGN_EXTEND_INREG
> was lowered into a combination of a packed logical shift left and a
> packed arithmetic shift right.
>
> With this patch, the two shift instructions produced by the
> 'lowerSIGN_EXTEND_INREG' function are correctly folded into a single
> BUILD_VECTOR when the Mask is a vector of Constants.
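>
> In the example above, the folded DAG is then simply (a sketch; the
> false mask elements sign-extend to zero):
>
> v4i32 SExtMask = BUILD_VECTOR 0, 0, 0, 0
> v4f32 Result = vselect SExtMask, %a, %b
>
> and the all-zero mask trivially selects %b.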
>
> Without this patch, the example above produced:
> pxor %xmm2, %xmm2
> pslld $31, %xmm2
> psrad $31, %xmm2
> pand %xmm2, %xmm0
> pandn %xmm1, %xmm2
> por %xmm0, %xmm2
> movdqa %xmm2, %xmm0
> ret
>
>
> With this patch, the compiler now produces:
> movaps %xmm1, %xmm0
>
> (More test cases have been added to llvm/test/CodeGen/X86/vselect.ll
> by this patch.)
>
> Since this patch touches function 'getTargetVShiftByConstNode', it
> also improves the code generated for the x86 builtin intrinsics for
> packed shifts by immediate count (e.g. @llvm.x86.sse2.pslli.w) when
> the input vector is a vector of constants.
>
> See for example test cases in file llvm/test/CodeGen/X86/vec_shift5.ll.
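>
> For instance, mirroring one of those test cases, a call such as
>
> %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 1, i32 2, i32 4, i32 8>, i32 3)
>
> is now folded to the constant vector <i32 8, i32 16, i32 32, i32 64>
> and emitted as a single load from the constant pool rather than a
> pslld by immediate count.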
>
> Please let me know if this is ok to submit.
>
> Thanks,
> Andrea Di Biagio
> SN Systems - Sony Computer Entertainment Group
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp (revision 197977)
+++ lib/Target/X86/X86ISelLowering.cpp (working copy)
@@ -11094,16 +11094,29 @@
MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
+// Return true if this is a BUILD_VECTOR of ConstantSDNodes.
+// This is a helper function used by `getTargetVShiftByConstNode`.
+static bool isBuildVectorOfConstantSDNodes(SDValue N) {
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ if (!isa<ConstantSDNode>(N->getOperand(i)))
+ return false;
+ return true;
+}
+
// getTargetVShiftByConstNode - Handle vector element shifts where the shift
// amount is a constant. Takes immediate version of shift as input.
static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, EVT VT,
SDValue SrcOp, uint64_t ShiftAmt,
SelectionDAG &DAG) {
+ EVT ElementType = VT.getVectorElementType();
// Check for ShiftAmt >= element width
- if (ShiftAmt >= VT.getVectorElementType().getSizeInBits()) {
+ if (ShiftAmt >= ElementType.getSizeInBits()) {
if (Opc == X86ISD::VSRAI)
- ShiftAmt = VT.getVectorElementType().getSizeInBits() - 1;
+ ShiftAmt = ElementType.getSizeInBits() - 1;
else
return DAG.getConstant(0, VT);
}
@@ -11111,6 +11124,42 @@
assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
&& "Unknown target vector shift-by-constant node");
+ // Try to fold this shift into a build vector.
+ // If SrcOp is a vector of all ConstantSDNode elements then we can statically
+ // compute the resulting vector and propagate the result.
+ if (isBuildVectorOfConstantSDNodes(SrcOp)) {
+ SmallVector<SDValue, 8> Elts;
+ unsigned NumElts = SrcOp->getNumOperands();
+ ConstantSDNode *CurrentND;
+
+ switch(Opc) {
+ default: llvm_unreachable(0);
+ case X86ISD::VSHLI:
+ for (unsigned i=0; i!=NumElts; ++i) {
+ CurrentND = cast<ConstantSDNode>(SrcOp->getOperand(i));
+ const APInt &C = CurrentND->getAPIntValue();
+ Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
+ }
+ break;
+ case X86ISD::VSRLI:
+ for (unsigned i=0; i!=NumElts; ++i) {
+ CurrentND = cast<ConstantSDNode>(SrcOp->getOperand(i));
+ const APInt &C = CurrentND->getAPIntValue();
+ Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
+ }
+ break;
+ case X86ISD::VSRAI:
+ for (unsigned i=0; i!=NumElts; ++i) {
+ CurrentND = cast<ConstantSDNode>(SrcOp->getOperand(i));
+ const APInt &C = CurrentND->getAPIntValue();
+ Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
+ }
+ break;
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElts);
+ }
+
return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
}
Index: test/CodeGen/X86/vselect.ll
===================================================================
--- test/CodeGen/X86/vselect.ll (revision 0)
+++ test/CodeGen/X86/vselect.ll (revision 0)
@@ -0,0 +1,105 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 -mattr=-sse4.1 < %s | FileCheck %s
+
+; Verify that we don't emit packed vector shift instructions if the
+; condition used by the vector select is a vector of constants.
+
+
+define <4 x float> @test1(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test1
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <4 x float> @test2(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test2
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <4 x float> @test3(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <4 x float> @test4(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: movaps %xmm1, %xmm0
+; CHECK: ret
+
+
+define <4 x float> @test5(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test5
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <8 x i16> @test6(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i16> %a, <8 x i16> %a
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test6
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <8 x i16> @test7(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test7
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <8 x i16> @test8(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test8
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+define <8 x i16> @test9(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test9
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
+
+define <8 x i16> @test10(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test10
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
Index: test/CodeGen/X86/blend-msb.ll
===================================================================
--- test/CodeGen/X86/blend-msb.ll (revision 197977)
+++ test/CodeGen/X86/blend-msb.ll (working copy)
@@ -29,9 +29,9 @@
; blendvb instruction or a sequence of NAND/OR/AND. Make sure that we do not
; reduce the mask in this case.
;CHECK-LABEL: vsel_8xi16:
-;CHECK: psllw
-;CHECK: psraw
-;CHECK: pblendvb
+;CHECK: andps
+;CHECK: andps
+;CHECK: orps
;CHECK: ret
define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
Index: test/CodeGen/X86/sse2-blend.ll
===================================================================
--- test/CodeGen/X86/sse2-blend.ll (revision 197977)
+++ test/CodeGen/X86/sse2-blend.ll (working copy)
@@ -1,9 +1,9 @@
; RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse4.1 | FileCheck %s
; CHECK: vsel_float
-; CHECK: pandn
-; CHECK: pand
-; CHECK: por
+; CHECK: xorps
+; CHECK: movss
+; CHECK: orps
; CHECK: ret
define void @vsel_float(<4 x float>* %v1, <4 x float>* %v2) {
%A = load <4 x float>* %v1
@@ -14,9 +14,9 @@
}
; CHECK: vsel_i32
-; CHECK: pandn
-; CHECK: pand
-; CHECK: por
+; CHECK: xorps
+; CHECK: movss
+; CHECK: orps
; CHECK: ret
define void @vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) {
%A = load <4 x i32>* %v1
Index: test/CodeGen/X86/2011-12-28-vselecti8.ll
===================================================================
--- test/CodeGen/X86/2011-12-28-vselecti8.ll (revision 197977)
+++ test/CodeGen/X86/2011-12-28-vselecti8.ll (working copy)
@@ -3,10 +3,24 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin11.2.0"
-; CHECK: @foo8
-; CHECK: psll
-; CHECK: psraw
-; CHECK: pblendvb
+; The backend should be able to fold the vector select
+; into a load of a vector from constant pool since all its operands
+; are vectors of integer constants.
+;
+; During legalization, the vselect mask is 'type legalized' into a
+; wider BUILD_VECTOR. This causes the introduction of a new
+; sign_extend_inreg in the DAG.
+;
+; The lowering of sign_extend_inreg eventually generates the sequence
+; psll+psraw. However, a constant shift of a vector of ConstantSDNode
+; elements can always be folded into a simple build_vector.
+;
+; Make sure that we don't generate psll, psraw and pblendvb from the vselect.
+
+; CHECK-LABEL: foo8
+; CHECK-NOT: psll
+; CHECK-NOT: psraw
+; CHECK-NOT: pblendvb
; CHECK: ret
define void @foo8(float* nocapture %RET) nounwind {
allocas:
Index: test/CodeGen/X86/vec_shift5.ll
===================================================================
--- test/CodeGen/X86/vec_shift5.ll (revision 0)
+++ test/CodeGen/X86/vec_shift5.ll (revision 0)
@@ -0,0 +1,90 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 -mattr=-sse4.1 < %s | FileCheck %s
+
+; Verify that we don't emit packed vector shift instructions if the
+; result is known at compile time.
+; All the packed vector shift by immediate count instructions below should
+; be translated into loads from constant pool of the expected result.
+
+define <8 x i16> @test1() {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> <i16 1, i16 2, i16 4, i16 8, i16 1, i16 2, i16 4, i16 8>, i32 3)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test1
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test2() {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> <i16 4, i16 8, i16 16, i16 32, i16 4, i16 8, i16 16, i16 32>, i32 3)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test2
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test3() {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 4, i16 8, i16 16, i16 32, i16 4, i16 8, i16 16, i16 32>, i32 3)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: psra
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test4() {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 1, i32 2, i32 4, i32 8>, i32 3)
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test5() {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> <i32 4, i32 8, i32 16, i32 32>, i32 3)
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test5
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test6() {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> <i32 4, i32 8, i32 16, i32 32>, i32 3)
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test6
+; CHECK-NOT: psra
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <2 x i64> @test7() {
+ %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> <i64 1, i64 2>, i32 3)
+ ret <2 x i64> %1
+}
+; CHECK-LABEL: test7
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <2 x i64> @test8() {
+ %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> <i64 8, i64 16>, i32 3)
+ ret <2 x i64> %1
+}
+; CHECK-LABEL: test8
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+
+declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32)
+declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32)
+declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32)
+declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32)
+declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32)
+declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32)
+declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32)
+declare <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64>, i32)
+