[PATCH][X86] Teach the backend how to lower vector shift left into multiply rather than scalarizing it.
Andrea Di Biagio
andrea.dibiagio at gmail.com
Thu Feb 6 11:45:53 PST 2014
Hi,
This patch teaches the backend how to efficiently lower a packed
vector shift left into a packed vector multiply if the vector of shift
counts is known to be constant (i.e. a constant build_vector).
Instead of expanding a packed shift into a sequence of scalar shifts,
the backend should try (when possible) to convert the vector shift
into a vector multiply.
Before this patch, a shift of an MVT::v8i16 vector by a build_vector of
constants was always scalarized into a long sequence of "vector
extracts + scalar shifts + vector inserts".
With this patch, if there is SSE2 support, we emit a single vector multiply.
The new x86 test 'vec_shift6.ll' contains some examples of code that
is affected by this patch.
Please let me know if this is ok to submit.
Thanks,
Andrea Di Biagio
SN Systems - Sony Computer Entertainment Group
-------------- next part --------------
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp (revision 200941)
+++ lib/Target/X86/X86ISelLowering.cpp (working copy)
@@ -13152,6 +13152,38 @@
return Op;
}
+ // If possible, lower this packed shift into a vector multiply instead of
+ // expanding it into a sequence of scalar shifts.
+ // Do this only if the vector type is either v8i16 or v4i32 and if the shift
+ // count is a constant build_vector.
+ if ((VT == MVT::v8i16 || VT == MVT::v4i32) && Op.getOpcode() == ISD::SHL &&
+ ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+ SmallVector<SDValue, 8> Elts;
+ EVT SVT = VT.getScalarType();
+ unsigned SVTBits = SVT.getSizeInBits();
+ const APInt &One = APInt(SVTBits, 1);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ for (unsigned i=0; i !=NumElems; ++i) {
+ SDValue Op = Amt->getOperand(i);
+ if (Op->getOpcode() == ISD::UNDEF) {
+ Elts.push_back(Op);
+ continue;
+ }
+
+ ConstantSDNode *ND = cast<ConstantSDNode>(Op);
+ const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
+ uint64_t ShAmt = C.getZExtValue();
+ if (ShAmt >= SVTBits) {
+ Elts.push_back(DAG.getUNDEF(SVT));
+ continue;
+ }
+ Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
+ }
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElems);
+ return DAG.getNode(ISD::MUL, dl, VT, R, BV);
+ }
+
// Lower SHL with variable shift amount.
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
Index: test/CodeGen/X86/avx-shift.ll
===================================================================
--- test/CodeGen/X86/avx-shift.ll (revision 200941)
+++ test/CodeGen/X86/avx-shift.ll (working copy)
@@ -115,8 +115,8 @@
; PR15141
; CHECK: _vshift13:
; CHECK-NOT: vpsll
-; CHECK: vcvttps2dq
-; CHECK-NEXT: vpmulld
+; CHECK-NOT: vcvttps2dq
+; CHECK: vpmulld
define <4 x i32> @vshift13(<4 x i32> %in) {
%T = shl <4 x i32> %in, <i32 0, i32 1, i32 2, i32 4>
ret <4 x i32> %T
Index: test/CodeGen/X86/vec_shift6.ll
===================================================================
--- test/CodeGen/X86/vec_shift6.ll (revision 0)
+++ test/CodeGen/X86/vec_shift6.ll (revision 0)
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+sse4.1 | FileCheck %s
+
+; Verify that we don't scalarize a packed vector shift left of 16-bit
+; signed integers if the amount is a constant build_vector.
+; Check that we produce an SSE2 packed integer multiply (pmullw) instead.
+
+define <8 x i16> @test1(<8 x i16> %a) {
+ %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
+ ret <8 x i16> %shl
+}
+; CHECK-LABEL: test1
+; CHECK: pmullw
+; CHECK-NEXT: ret
+
+
+define <8 x i16> @test2(<8 x i16> %a) {
+ %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
+ ret <8 x i16> %shl
+}
+; CHECK-LABEL: test2
+; CHECK: pmullw
+; CHECK-NEXT: ret
+
+; Verify that a vector shift left of 32-bit signed integers is simply expanded
+; into an SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
+; counts is a constant build_vector.
+
+define <4 x i32> @test3(<4 x i32> %a) {
+ %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
+ ret <4 x i32> %shl
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: cvttps2dq
+; CHECK: pmulld
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4(<4 x i32> %a) {
+ %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
+ ret <4 x i32> %shl
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: cvttps2dq
+; CHECK: pmulld
+; CHECK-NEXT: ret
+
More information about the llvm-commits
mailing list