[llvm] r217100 - [x86] Teach the new vector shuffle lowering about the simplest of

Wed Sep 3 15:48:34 PDT 2014

Author: chandlerc
Date: Wed Sep  3 17:48:34 2014
New Revision: 217100

URL: http://llvm.org/viewvc/llvm-project?rev=217100&view=rev
Log:
[x86] Teach the new vector shuffle lowering about the simplest of
'insertps' patterns.

This replaces two shuffles with a single insertps in very common cases.
My next patch will extend this to leverage the zeroing capabilities of
insertps which will allow it to be used in a much wider set of cases.

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=217100&r1=217099&r2=217100&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Sep  3 17:48:34 2014
@@ -7182,6 +7182,21 @@ static bool isSingleInputShuffleMask(Arr
   return true;
 }
 
+/// \brief Check whether every mask entry drawn from one input is in place.
+///
+/// Undef (-1) entries and entries selecting the other input are skipped; the
+/// inputs split at Mask.size(), with \p LoInput choosing the first one.
+static bool isShuffleMaskInputInPlace(ArrayRef<int> Mask, bool LoInput = true) {
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i) {
+    int M = Mask[i];
+    // Ignore undef lanes and lanes that read from the input not being checked.
+    if (M != -1 && (M < Size) == LoInput && M - (LoInput ? 0 : Size) != i)
+      return false;
+  }
+  return true;
+}
+
 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
 // 2013 will allow us to use it as a non-type template parameter.
 namespace {
@@ -7365,6 +7380,20 @@ static SDValue lowerV4F32VectorShuffle(S
     int V2Index =
         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
         Mask.begin();
+
+    // Check for whether we can use INSERTPS to perform the blend. We only use
+    // INSERTPS when the V1 elements are already in the correct locations
+    // because otherwise we can just always use two SHUFPS instructions which
+    // are much smaller to encode than a SHUFPS and an INSERTPS.
+    if (Subtarget->hasSSE41() &&
+        isShuffleMaskInputInPlace(Mask, /*LoInput*/ true)) {
+      // Insert the V2 element into the desired position.
+      SDValue InsertPSMask =
+          DAG.getIntPtrConstant(Mask[V2Index] << 6 | V2Index << 4);
+      return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+                         InsertPSMask);
+    }
+
     // Compute the index adjacent to V2Index and in the same half by toggling
     // the low bit.
     int V2AdjIndex = V2Index ^ 1;

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll?rev=217100&r1=217099&r2=217100&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll Wed Sep  3 17:48:34 2014
@@ -121,10 +121,18 @@ define <4 x float> @shuffle_v4f32_3210(<
 }
 
 define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
-; ALL-LABEL: @shuffle_v4i32_0124
-; ALL:         shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[2,0]
-; ALL-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[2,0]
-; ALL-NEXT:    retq
+; SSE2-LABEL: @shuffle_v4i32_0124
+; SSE2:         shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4i32_0124
+; SSE41:         insertps {{.*}} # xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4i32_0124
+; AVX1:         vinsertps {{.*}} # xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX1-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
   ret <4 x i32> %shuffle
 }