[llvm] r218177 - [x86] Teach the v4f32 path of the new shuffle lowering to handle the

Fri Sep 19 21:15:22 PDT 2014

Author: chandlerc
Date: Fri Sep 19 23:15:22 2014
New Revision: 218177

URL: http://llvm.org/viewvc/llvm-project?rev=218177&view=rev
Log:
[x86] Teach the v4f32 path of the new shuffle lowering to handle the
tricky case of single-element insertion into the zero lane of a zero
vector.

We can't just use the same pattern here as we do in every other vector
type because the general insertion logic can also handle insertion into
a non-zero lane of the vector. However, in SSE4.1 with v4f32 vectors we
have INSERTPS that is a much better choice than the generic one for such
lowerings. But INSERTPS can do lots of other lowerings as well so
factoring its logic into the general insertion logic doesn't work very
well. We also can't just extract the core common part of the general
insertion logic that is faster (forming VZEXT_MOVL synthetic nodes that
lower to MOVSS when they can) because VZEXT_MOVL is often *faster* than
a blend while INSERTPS is slower! So instead we do a restrictive
condition on attempting to use the generic insertion logic to narrow it
to those cases where VZEXT_MOVL won't need a shuffle afterward and thus
will do better than INSERTPS. Then we try blending. Then we go back to
INSERTPS.

This still doesn't generate perfect code for some silly reasons that can
be fixed by tweaking the td files for lowering VZEXT_MOVL to use
XORPS+BLENDPS when available rather than XORPS+MOVSS when the input ends
up in a register rather than a load from memory -- BLENDPSrr has twice
the throughput (half the reciprocal throughput) of MOVSSrr. Don't you
love this ISA?

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=218177&r1=218176&r2=218177&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Sep 19 23:15:22 2014
@@ -7784,6 +7784,16 @@ static SDValue lowerV4F32VectorShuffle(S
   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
 
+  // There are special ways we can lower some single-element blends. However, we
+  // have custom ways we can lower more complex single-element blends below that
+  // we defer to if both this and BLENDPS fail to match, so restrict this to
+  // when the V2 input is targeting element 0 of the mask -- that is the fast
+  // case here.
+  if (NumV2Elements == 1 && Mask[0] >= 4)
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
+                                                         Mask, Subtarget, DAG))
+      return V;
+
   if (Subtarget->hasSSE41())
     if (SDValue Blend =
             lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, DAG))

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll?rev=218177&r1=218176&r2=218177&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll Fri Sep 19 23:15:22 2014
@@ -806,3 +806,74 @@ define <4 x i32> @shuffle_v4i32_0z1z(<4
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
   ret <4 x i32> %shuffle
 }
+
+define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
+; ALL-LABEL: @insert_reg_and_zero_v4i32
+; ALL:       # BB#0:
+; ALL-NEXT:    movd %edi, %xmm0
+; ALL-NEXT:    retq
+  %v = insertelement <4 x i32> undef, i32 %a, i32 0
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) {
+; ALL-LABEL: @insert_mem_and_zero_v4i32
+; ALL:       # BB#0:
+; ALL-NEXT:    movd (%rdi), %xmm0
+; ALL-NEXT:    retq
+  %a = load i32* %ptr
+  %v = insertelement <4 x i32> undef, i32 %a, i32 0
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
+; SSE2-LABEL: @insert_reg_and_zero_v4f32
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    movss %xmm0, %[[X]]
+; SSE2-NEXT:    movaps %[[X]], %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: @insert_reg_and_zero_v4f32
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE3-NEXT:    movss %xmm0, %[[X]]
+; SSE3-NEXT:    movaps %[[X]], %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: @insert_reg_and_zero_v4f32
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSSE3-NEXT:    movss %xmm0, %[[X]]
+; SSSE3-NEXT:    movaps %[[X]], %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @insert_reg_and_zero_v4f32
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE41-NEXT:    movss %xmm0, %[[X]]
+; SSE41-NEXT:    movaps %[[X]], %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @insert_reg_and_zero_v4f32
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorps %[[X:xmm[0-9]+]], %[[X]], %[[X]]
+; AVX1-NEXT:    vmovss %xmm0, %[[X]], %xmm0
+; AVX1-NEXT:    retq
+  %v = insertelement <4 x float> undef, float %a, i32 0
+  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
+; ALL-LABEL: @insert_mem_and_zero_v4f32
+; ALL:       # BB#0:
+; ALL-NEXT:    movss (%rdi), %xmm0
+; ALL-NEXT:    retq
+  %a = load float* %ptr
+  %v = insertelement <4 x float> undef, float %a, i32 0
+  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuffle
+}