[PATCH][X86] Teach the backend how to lower vector shift left into multiply rather than scalarizing it.

Thu Feb 6 11:45:53 PST 2014

Hi,

This patch teaches the backend how to efficiently lower a packed
vector shift left into a packed vector multiply if the vector of shift
counts is known to be constant (i.e. a constant build_vector).

Instead of expanding a packed shift into a sequence of scalar shifts,
the backend should try (when possible) to convert the vector shift
into a vector multiply.

Before this patch, a shift of an MVT::v8i16 vector by a build_vector of
constants was always scalarized into a long sequence of "vector
extracts + scalar shifts + vector inserts".
With this patch, if there is SSE2 support, we emit a single vector multiply.

The new x86 test 'vec_shift6.ll' contains some examples of code that
is affected by this patch.

Please let me know if this is ok to submit.

Thanks,
Andrea Di Biagio
SN Systems - Sony Computer Entertainment Group
-------------- next part --------------
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================

--- lib/Target/X86/X86ISelLowering.cpp	(revision 200941)
+++ lib/Target/X86/X86ISelLowering.cpp	(working copy)
@@ -13152,6 +13152,38 @@
       return Op;
   }
 
+  // If possible, lower this packed shift into a vector multiply instead of
+  // expanding it into a sequence of scalar shifts.
+  // Do this only if the vector type is either v8i16 or v4i32 and if the shift
+  // count is a constant build_vector.
+  if ((VT == MVT::v8i16 || VT == MVT::v4i32) && Op.getOpcode() == ISD::SHL &&
+      ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+    SmallVector<SDValue, 8> Elts;
+    EVT SVT = VT.getScalarType();
+    unsigned SVTBits = SVT.getSizeInBits();
+    const APInt &One = APInt(SVTBits, 1);
+    unsigned NumElems = VT.getVectorNumElements();
+
+    for (unsigned i=0; i !=NumElems; ++i) {
+      SDValue Op = Amt->getOperand(i);
+      if (Op->getOpcode() == ISD::UNDEF) {
+        Elts.push_back(Op);
+        continue;
+      }
+
+      ConstantSDNode *ND = cast<ConstantSDNode>(Op);
+      const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
+      uint64_t ShAmt = C.getZExtValue();
+      if (ShAmt >= SVTBits) {
+        Elts.push_back(DAG.getUNDEF(SVT));
+        continue;
+      }
+      Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
+    }
+    SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElems);
+    return DAG.getNode(ISD::MUL, dl, VT, R, BV);
+  }
+ 
   // Lower SHL with variable shift amount.
   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
Index: test/CodeGen/X86/avx-shift.ll
===================================================================
--- test/CodeGen/X86/avx-shift.ll	(revision 200941)
+++ test/CodeGen/X86/avx-shift.ll	(working copy)
@@ -115,8 +115,8 @@
 ; PR15141
 ; CHECK: _vshift13:
 ; CHECK-NOT: vpsll
-; CHECK: vcvttps2dq
-; CHECK-NEXT: vpmulld
+; CHECK-NOT: vcvttps2dq
+; CHECK: vpmulld
 define <4 x i32> @vshift13(<4 x i32> %in) {
   %T = shl <4 x i32> %in, <i32 0, i32 1, i32 2, i32 4>
   ret <4 x i32> %T
Index: test/CodeGen/X86/vec_shift6.ll
===================================================================
--- test/CodeGen/X86/vec_shift6.ll	(revision 0)
+++ test/CodeGen/X86/vec_shift6.ll	(revision 0)
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+sse4.1 | FileCheck %s
+
+; Verify that we don't scalarize a packed vector shift left of 16-bit
+; signed integers if the amount is a constant build_vector.
+; Check that we produce a SSE2 packed integer multiply (pmullw) instead.
+
+define <8 x i16> @test1(<8 x i16> %a) {
+  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
+  ret <8 x i16> %shl
+}
+; CHECK-LABEL: test1
+; CHECK: pmullw
+; CHECK-NEXT: ret
+
+
+define <8 x i16> @test2(<8 x i16> %a) {
+  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
+  ret <8 x i16> %shl
+}
+; CHECK-LABEL: test2
+; CHECK: pmullw
+; CHECK-NEXT: ret
+
+; Verify that a vector shift left of 32-bit signed integers is simply expanded
+; into a SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
+; counts is a constant build_vector.
+
+define <4 x i32> @test3(<4 x i32> %a) {
+  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
+  ret <4 x i32> %shl
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: cvttps2dq
+; CHECK: pmulld
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4(<4 x i32> %a) {
+  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
+  ret <4 x i32> %shl
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: cvttps2dq
+; CHECK: pmulld
+; CHECK-NEXT: ret
+