[PATCH] D60852: Fix for bug 41512: lower INSERT_VECTOR_ELT(ZeroVec, 0, Elt) to SCALAR_TO_VECTOR(Elt) for all SSE flavors
Serge Preis via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 17 22:21:31 PDT 2019
Serge_Preis created this revision.
Serge_Preis added reviewers: craig.topper, spatel.
Serge_Preis added a project: LLVM.
Herald added subscribers: llvm-commits, hiraditya.
Current LLVM uses pxor+pinsrb on SSE4+ for INSERT_VECTOR_ELT(ZeroVec, 0, Elt) instead of the much simpler movd.
INSERT_VECTOR_ELT(ZeroVec, 0, Elt) is an idiomatic construct which is used e.g. for _mm_cvtsi32_si128(Elt) and for lowest-element initialization in _mm_set_epi32.
So such inefficient lowering leads to significant performance degradation in certain cases when switching from SSSE3 to SSE4.
Here INSERT_VECTOR_ELT(ZeroVec, 0, Elt) is simply converted to SCALAR_TO_VECTOR(Elt) when applicable, since the latter is a closer match to the desired behavior and is always efficiently lowered to movd and the like.
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D60852
Files:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vec_insert_first.ll
Index: llvm/test/CodeGen/X86/vec_insert_first.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/vec_insert_first.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse4.1 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s
+
+define dso_local void @_Z4LoopPKiRDv2_xjj(i32* nocapture readonly %d, <2 x i64>* nocapture dereferenceable(16) %res, i32 %st1, i32 %st2) local_unnamed_addr #0 {
+
+; CHECK-NOT: pinsr
+entry:
+ %idxprom.i = zext i32 %st1 to i64
+ %arrayidx.i = getelementptr inbounds i32, i32* %d, i64 %idxprom.i
+ %idxprom1.i = zext i32 %st2 to i64
+ %arrayidx2.i = getelementptr inbounds i32, i32* %d, i64 %idxprom1.i
+ %0 = load i32, i32* %arrayidx.i, align 4
+ %vecinit3.i.i = insertelement <4 x i32> <i32 undef, i32 0, i32 undef, i32 undef>, i32 %0, i32 0
+ %1 = bitcast <4 x i32> %vecinit3.i.i to <2 x i64>
+ %2 = load i32, i32* %arrayidx2.i, align 4
+ %vecinit3.i10.i = insertelement <4 x i32> <i32 undef, i32 0, i32 undef, i32 undef>, i32 %2, i32 0
+ %3 = bitcast <4 x i32> %vecinit3.i10.i to <2 x i64>
+ %4 = load <2 x i64>, <2 x i64>* %res, align 16
+ %shuffle.i.i = shufflevector <2 x i64> %1, <2 x i64> %3, <2 x i32> <i32 0, i32 2>
+ %add.i.i = add <2 x i64> %shuffle.i.i, %4
+ store <2 x i64> %add.i.i, <2 x i64>* %res, align 16
+ %5 = load i32, i32* %arrayidx.i, align 4
+ %vecinit3.i.i.1 = insertelement <4 x i32> <i32 undef, i32 0, i32 undef, i32 undef>, i32 %5, i32 0
+ %6 = bitcast <4 x i32> %vecinit3.i.i.1 to <2 x i64>
+ %7 = load i32, i32* %arrayidx2.i, align 4
+ %vecinit3.i10.i.1 = insertelement <4 x i32> <i32 undef, i32 0, i32 undef, i32 undef>, i32 %7, i32 0
+ %8 = bitcast <4 x i32> %vecinit3.i10.i.1 to <2 x i64>
+ %shuffle.i.i.1 = shufflevector <2 x i64> %6, <2 x i64> %8, <2 x i32> <i32 0, i32 2>
+ %add.i.i.1 = add <2 x i64> %shuffle.i.i.1, %add.i.i
+ store <2 x i64> %add.i.i.1, <2 x i64>* %res, align 16
+ %9 = load i32, i32* %arrayidx.i, align 4
+ %vecinit3.i.i.2 = insertelement <4 x i32> <i32 undef, i32 0, i32 undef, i32 undef>, i32 %9, i32 0
+ %10 = bitcast <4 x i32> %vecinit3.i.i.2 to <2 x i64>
+ %11 = load i32, i32* %arrayidx2.i, align 4
+ %vecinit3.i10.i.2 = insertelement <4 x i32> <i32 undef, i32 0, i32 undef, i32 undef>, i32 %11, i32 0
+ %12 = bitcast <4 x i32> %vecinit3.i10.i.2 to <2 x i64>
+ %shuffle.i.i.2 = shufflevector <2 x i64> %10, <2 x i64> %12, <2 x i32> <i32 0, i32 2>
+ %add.i.i.2 = add <2 x i64> %shuffle.i.i.2, %add.i.i.1
+ store <2 x i64> %add.i.i.2, <2 x i64>* %res, align 16
+ %13 = load i32, i32* %arrayidx.i, align 4
+ %vecinit3.i.i.3 = insertelement <4 x i32> <i32 undef, i32 0, i32 undef, i32 undef>, i32 %13, i32 0
+ %14 = bitcast <4 x i32> %vecinit3.i.i.3 to <2 x i64>
+ %15 = load i32, i32* %arrayidx2.i, align 4
+ %vecinit3.i10.i.3 = insertelement <4 x i32> <i32 undef, i32 0, i32 undef, i32 undef>, i32 %15, i32 0
+ %16 = bitcast <4 x i32> %vecinit3.i10.i.3 to <2 x i64>
+ %shuffle.i.i.3 = shufflevector <2 x i64> %14, <2 x i64> %16, <2 x i32> <i32 0, i32 2>
+ %add.i.i.3 = add <2 x i64> %shuffle.i.i.3, %add.i.i.2
+ store <2 x i64> %add.i.i.3, <2 x i64>* %res, align 16
+ ret void
+}
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -16966,6 +16966,15 @@
}
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
+ // This will be just movd/movq/movss/movsd
+ if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
+ if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
+ (EltVT == MVT::i64 && Subtarget.is64Bit())) {
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
+ return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
+ }
+ }
+
// Transform it so it match pinsr{b,w} which expects a GR32 as its second
// argument. SSE41 required for pinsrb.
if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D60852.195673.patch
Type: text/x-patch
Size: 4112 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20190418/b4a8143d/attachment.bin>
More information about the llvm-commits
mailing list