[PATCH][InstCombine][X86] Improve the folding of calls to X86 packed shifts intrinsics.

Andrea Di Biagio andrea.dibiagio at gmail.com
Thu May 8 07:28:00 PDT 2014


Hi Jim,

As you suggested, here is a patch that teaches the x86 backend how to
combine/fold packed SSE2/AVX2 arithmetic shift intrinsic dag nodes.
The logic is basically the same: shifts by zero are always folded into
the first operand, and arithmetic shift intrinsic dag nodes are lowered
into ISD::SRA dag nodes only when the shift count is known to be smaller
than the vector element size. The latter lets the existing
target-independent combine rules in DAGCombiner kick in.
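
For example (a minimal sketch; the IR below is illustrative and not
taken from the patch), a chain of shifts by constant counts can now be
merged by DAGCombiner into a single shift:

  %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 3)
  %2 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %1, i32 2)

  ; Both calls are rewritten into ISD::SRA nodes, which DAGCombiner then
  ; folds into the equivalent of a single shift:
  %r = ashr <4 x i32> %A, <i32 5, i32 5, i32 5, i32 5>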

I also added an if-statement to function 'getTargetVShiftByConstNode'
that checks for a shift by zero and trivially folds it into the first
operand.
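
For instance (again just a sketch), a shift by an all-zero count now
folds away entirely:

  %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 0)
  ; %1 is simply replaced by %A, so no shift instruction is emitted.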

I added two new tests to verify that DAGCombiner is able to fold
sequences of SSE2/AVX2 packed arithmetic shift calls.

Please let me know if this is OK to submit.

Thanks!
Andrea

On Thu, May 8, 2014 at 10:18 AM, Andrea Di Biagio
<andrea.dibiagio at gmail.com> wrote:
> Hi Jim,
> Thanks for the feedback!
>
> I'll try to move these changes to the backend.
>
> Cheers,
> Andrea
>
> On 8 May 2014 03:18, "Jim Grosbach" <grosbach at apple.com> wrote:
>>
>> Hi Andrea,
>>
>> I’m really excited to see these patches continuing. Our vector codegen has
>> been needing exactly this sort of detail-oriented tuning for a long time
>> now.
>>
>> These are both good improvements, but would be better as DAGCombines in
>> the X86 backend. The main argument for doing these intrinsic combines at the
>> IR level is that the input expression may be split across multiple
>> basic blocks by the time the backend sees it and would thus not be
>> recognized by a DAG combiner. Both of these transforms should avoid that
>> problem, though, and so can be dealt with there.
>>
>> -Jim
>>
>> On May 7, 2014, at 8:42 AM, Andrea Di Biagio <andrea.dibiagio at gmail.com>
>> wrote:
>>
>> > Hi,
>> >
>> > This patch teaches InstCombine how to fold a packed SSE2/AVX2 shift
>> > intrinsic into its first operand if the shift count is a zero vector
>> > (i.e. a 'ConstantAggregateZero').
>> > Also, this patch teaches InstCombine how to lower a packed arithmetic
>> > shift intrinsic into an 'ashr' instruction if the shift count is
>> > known to be smaller than the vector element size.
>> >
>> > Please let me know if this is OK to submit.
>> >
>> > Thanks,
>> > Andrea Di Biagio
>> > SN Systems - Sony Computer Entertainment Group
>> > <patch-instcombine-vshifts.diff>
>>
>
Index: test/CodeGen/X86/combine-avx2-intrinsics.ll
===================================================================
--- test/CodeGen/X86/combine-avx2-intrinsics.ll	(revision 0)
+++ test/CodeGen/X86/combine-avx2-intrinsics.ll	(working copy)
@@ -0,0 +1,51 @@
+; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s
+
+; Verify that the backend correctly combines AVX2 builtin intrinsics.
+
+
+define <8 x i32> @test_psra_1(<8 x i32> %A) {
+  %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 3)
+  %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> <i32 3, i32 0, i32 7, i32 0>)
+  %3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %2, i32 2)
+  ret <8 x i32> %3
+}
+; CHECK-LABEL: test_psra_1
+; CHECK: vpsrad $8, %ymm0, %ymm0
+; CHECK-NEXT: ret
+
+define <16 x i16> @test_psra_2(<16 x i16> %A) {
+  %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 3)
+  %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> <i16 3, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
+  %3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %2, i32 2)
+  ret <16 x i16> %3
+}
+; CHECK-LABEL: test_psra_2
+; CHECK: vpsraw $8, %ymm0, %ymm0
+; CHECK-NEXT: ret
+
+define <16 x i16> @test_psra_3(<16 x i16> %A) {
+  %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 0)
+  %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
+  %3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %2, i32 0)
+  ret <16 x i16> %3
+}
+; CHECK-LABEL: test_psra_3
+; CHECK-NOT: vpsraw
+; CHECK: ret
+
+define <8 x i32> @test_psra_4(<8 x i32> %A) {
+  %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 0)
+  %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> <i32 0, i32 0, i32 7, i32 0>)
+  %3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %2, i32 0)
+  ret <8 x i32> %3
+}
+; CHECK-LABEL: test_psra_4
+; CHECK-NOT: vpsrad
+; CHECK: ret
+
+
+declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>)
+declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32)
+declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32)
+
Index: test/CodeGen/X86/combine-sse2-intrinsics.ll
===================================================================
--- test/CodeGen/X86/combine-sse2-intrinsics.ll	(revision 0)
+++ test/CodeGen/X86/combine-sse2-intrinsics.ll	(working copy)
@@ -0,0 +1,53 @@
+; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+
+; Verify that the backend correctly combines SSE2 builtin intrinsics.
+
+
+define <4 x i32> @test_psra_1(<4 x i32> %A) {
+  %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 3)
+  %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> <i32 3, i32 0, i32 7, i32 0>)
+  %3 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %2, i32 2)
+  ret <4 x i32> %3
+}
+; CHECK-LABEL: test_psra_1
+; CHECK: psrad $8, %xmm0
+; CHECK-NEXT: ret
+
+define <8 x i16> @test_psra_2(<8 x i16> %A) {
+  %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 3)
+  %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> <i16 3, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
+  %3 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %2, i32 2)
+  ret <8 x i16> %3
+}
+; CHECK-LABEL: test_psra_2
+; CHECK: psraw $8, %xmm0
+; CHECK-NEXT: ret
+
+define <4 x i32> @test_psra_3(<4 x i32> %A) {
+  %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 0)
+  %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> <i32 0, i32 0, i32 7, i32 0>)
+  %3 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %2, i32 0)
+  ret <4 x i32> %3
+}
+; CHECK-LABEL: test_psra_3
+; CHECK-NOT: psrad
+; CHECK: ret
+
+
+define <8 x i16> @test_psra_4(<8 x i16> %A) {
+  %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 0)
+  %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
+  %3 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %2, i32 0)
+  ret <8 x i16> %3
+}
+; CHECK-LABEL: test_psra_4
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32)
+declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32)
+
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp	(revision 208315)
+++ lib/Target/X86/X86ISelLowering.cpp	(working copy)
@@ -1556,6 +1556,7 @@
   setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::SINT_TO_FP);
   setTargetDAGCombine(ISD::SETCC);
+  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
   if (Subtarget->is64Bit())
     setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::XOR);
@@ -11615,6 +11616,10 @@
                                           SelectionDAG &DAG) {
   MVT ElementType = VT.getVectorElementType();
 
+  // Fold this packed shift into its first operand if ShiftAmt is 0.
+  if (ShiftAmt == 0)
+    return SrcOp;
+
   // Check for ShiftAmt >= element width
   if (ShiftAmt >= ElementType.getSizeInBits()) {
     if (Opc == X86ISD::VSRAI)
@@ -18484,6 +18489,55 @@
   return SDValue();
 }
 
+static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG) {
+  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  switch (IntNo) {
+  default: return SDValue();
+  // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
+  case Intrinsic::x86_sse2_psrai_w:
+  case Intrinsic::x86_sse2_psrai_d:
+  case Intrinsic::x86_avx2_psrai_w:
+  case Intrinsic::x86_avx2_psrai_d:
+  case Intrinsic::x86_sse2_psra_w:
+  case Intrinsic::x86_sse2_psra_d:
+  case Intrinsic::x86_avx2_psra_w:
+  case Intrinsic::x86_avx2_psra_d: {
+    SDValue Op0 = N->getOperand(1);
+    SDValue Op1 = N->getOperand(2);
+    EVT VT = Op0.getValueType();
+    assert(VT.isVector() && "Expected a vector type!");
+
+    if (isa<BuildVectorSDNode>(Op1))
+      Op1 = Op1.getOperand(0);
+
+    if (!isa<ConstantSDNode>(Op1))
+      return SDValue();
+
+    EVT SVT = VT.getVectorElementType();
+    unsigned SVTBits = SVT.getSizeInBits();
+
+    ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
+    APInt C(SVTBits, CND->getAPIntValue().getZExtValue());
+    uint64_t ShAmt = C.getZExtValue();
+
+    // Don't try to convert this shift into an ISD::SRA if the shift
+    // count is bigger than or equal to the element size.
+    if (ShAmt >= SVTBits)
+      return SDValue();
+
+    // Trivial case: if the shift count is zero, then fold this
+    // into the first operand.
+    if (ShAmt == 0)
+      return Op0;
+
+    // Replace this packed shift intrinsic with a target-independent
+    // shift dag node.
+    SDValue Splat = DAG.getConstant(C, VT);
+    return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
+  }
+  }
+}
+
 /// PerformMulCombine - Optimize a single multiply with constant into two
 /// in order to implement it with two cheaper instructions, e.g.
 /// LEA + SHL, LEA + LEA.
@@ -20304,6 +20358,7 @@
   case X86ISD::VPERM2X128:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
+  case ISD::INTRINSIC_WO_CHAIN: return PerformINTRINSIC_WO_CHAINCombine(N, DAG);
   }
 
   return SDValue();

