[PATCH] D60852: Fix for bug 41512: lower INSERT_VECTOR_ELT(ZeroVec, 0, Elt) to SCALAR_TO_VECTOR(Elt) for all SSE flavors

Serge Preis via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 17 22:21:31 PDT 2019


Serge_Preis created this revision.
Serge_Preis added reviewers: craig.topper, spatel.
Serge_Preis added a project: LLVM.
Herald added subscribers: llvm-commits, hiraditya.

Current LLVM uses pxor+pinsrb on SSE4+ for INSERT_VECTOR_ELT(ZeroVec, 0, Elt) instead of the much simpler movd. 
INSERT_VECTOR_ELT(ZeroVec, 0, Elt) is an idiomatic construct which is used e.g. for _mm_cvtsi32_si128(Elt) and for the lowest-element initialization in _mm_set_epi32. 
So such inefficient lowering leads to significant performance degradations in certain cases when switching from SSSE3 to SSE4.

Here INSERT_VECTOR_ELT(ZeroVec, 0, Elt) is simply converted to SCALAR_TO_VECTOR(Elt) when applicable, since the latter is a closer match to the desired behavior and is always efficiently lowered to movd and the like.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D60852

Files:
  llvm/lib/Target/X86/X86ISelLowering.cpp
  llvm/test/CodeGen/X86/vec_insert_first.ll


Index: llvm/test/CodeGen/X86/vec_insert_first.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/vec_insert_first.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse4.1 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s
+
+define dso_local void @_Z4LoopPKiRDv2_xjj(i32* nocapture readonly %d, <2 x i64>* nocapture dereferenceable(16) %res, i32 %st1, i32 %st2) local_unnamed_addr #0 {
+
+; CHECK-NOT: pinsr
+entry:
+  %idxprom.i = zext i32 %st1 to i64
+  %arrayidx.i = getelementptr inbounds i32, i32* %d, i64 %idxprom.i
+  %idxprom1.i = zext i32 %st2 to i64
+  %arrayidx2.i = getelementptr inbounds i32, i32* %d, i64 %idxprom1.i
+  %0 = load i32, i32* %arrayidx.i, align 4
+  %vecinit3.i.i = insertelement <4 x i32> <i32 undef, i32 0, i32 undef, i32 undef>, i32 %0, i32 0
+  %1 = bitcast <4 x i32> %vecinit3.i.i to <2 x i64>
+  %2 = load i32, i32* %arrayidx2.i, align 4
+  %vecinit3.i10.i = insertelement <4 x i32> <i32 undef, i32 0, i32 undef, i32 undef>, i32 %2, i32 0
+  %3 = bitcast <4 x i32> %vecinit3.i10.i to <2 x i64>
+  %4 = load <2 x i64>, <2 x i64>* %res, align 16
+  %shuffle.i.i = shufflevector <2 x i64> %1, <2 x i64> %3, <2 x i32> <i32 0, i32 2>
+  %add.i.i = add <2 x i64> %shuffle.i.i, %4
+  store <2 x i64> %add.i.i, <2 x i64>* %res, align 16
+  %5 = load i32, i32* %arrayidx.i, align 4
+  %vecinit3.i.i.1 = insertelement <4 x i32> <i32 undef, i32 0, i32 undef, i32 undef>, i32 %5, i32 0
+  %6 = bitcast <4 x i32> %vecinit3.i.i.1 to <2 x i64>
+  %7 = load i32, i32* %arrayidx2.i, align 4
+  %vecinit3.i10.i.1 = insertelement <4 x i32> <i32 undef, i32 0, i32 undef, i32 undef>, i32 %7, i32 0
+  %8 = bitcast <4 x i32> %vecinit3.i10.i.1 to <2 x i64>
+  %shuffle.i.i.1 = shufflevector <2 x i64> %6, <2 x i64> %8, <2 x i32> <i32 0, i32 2>
+  %add.i.i.1 = add <2 x i64> %shuffle.i.i.1, %add.i.i
+  store <2 x i64> %add.i.i.1, <2 x i64>* %res, align 16
+  %9 = load i32, i32* %arrayidx.i, align 4
+  %vecinit3.i.i.2 = insertelement <4 x i32> <i32 undef, i32 0, i32 undef, i32 undef>, i32 %9, i32 0
+  %10 = bitcast <4 x i32> %vecinit3.i.i.2 to <2 x i64>
+  %11 = load i32, i32* %arrayidx2.i, align 4
+  %vecinit3.i10.i.2 = insertelement <4 x i32> <i32 undef, i32 0, i32 undef, i32 undef>, i32 %11, i32 0
+  %12 = bitcast <4 x i32> %vecinit3.i10.i.2 to <2 x i64>
+  %shuffle.i.i.2 = shufflevector <2 x i64> %10, <2 x i64> %12, <2 x i32> <i32 0, i32 2>
+  %add.i.i.2 = add <2 x i64> %shuffle.i.i.2, %add.i.i.1
+  store <2 x i64> %add.i.i.2, <2 x i64>* %res, align 16
+  %13 = load i32, i32* %arrayidx.i, align 4
+  %vecinit3.i.i.3 = insertelement <4 x i32> <i32 undef, i32 0, i32 undef, i32 undef>, i32 %13, i32 0
+  %14 = bitcast <4 x i32> %vecinit3.i.i.3 to <2 x i64>
+  %15 = load i32, i32* %arrayidx2.i, align 4
+  %vecinit3.i10.i.3 = insertelement <4 x i32> <i32 undef, i32 0, i32 undef, i32 undef>, i32 %15, i32 0
+  %16 = bitcast <4 x i32> %vecinit3.i10.i.3 to <2 x i64>
+  %shuffle.i.i.3 = shufflevector <2 x i64> %14, <2 x i64> %16, <2 x i32> <i32 0, i32 2>
+  %add.i.i.3 = add <2 x i64> %shuffle.i.i.3, %add.i.i.2
+  store <2 x i64> %add.i.i.3, <2 x i64>* %res, align 16
+  ret void
+}
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -16966,6 +16966,15 @@
   }
   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
 
+  // This will be just movd/movq/movss/movsd
+  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
+    if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
+        (EltVT == MVT::i64 && Subtarget.is64Bit())) {
+      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
+      return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
+    }
+  }
+
   // Transform it so it match pinsr{b,w} which expects a GR32 as its second
   // argument. SSE41 required for pinsrb.
   if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D60852.195673.patch
Type: text/x-patch
Size: 4112 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20190418/b4a8143d/attachment.bin>


More information about the llvm-commits mailing list