[llvm] r217117 - [x86] Teach the new vector shuffle lowering about the zero masking

Wed Sep 3 18:13:48 PDT 2014

Author: chandlerc
Date: Wed Sep  3 20:13:48 2014
New Revision: 217117

URL: http://llvm.org/viewvc/llvm-project?rev=217117&view=rev
Log:
[x86] Teach the new vector shuffle lowering about the zero masking
abilities of INSERTPS which are really powerful and come up in very
important contexts such as forming diagonal matrices, etc.

With this I ended up being able to remove the somewhat weird helper
I added for INSERTPS because we can collapse the entire state to a no-op
mask. Added a bunch of tests for inserting into a zero-ish vector.

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=217117&r1=217116&r2=217117&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Sep  3 20:13:48 2014
@@ -7182,21 +7182,6 @@ static bool isSingleInputShuffleMask(Arr
   return true;
 }
 
-/// \brief Check wether all of one set of inputs to a shuffle mask are in place.
-///
-/// Mask entries pointing at the other input or undef will be skipped.
-static bool isShuffleMaskInputInPlace(ArrayRef<int> Mask, bool LoInput = true) {
-  int Size = Mask.size();
-  for (int i = 0; i < Size; ++i) {
-    int M = Mask[i];
-    if (M == -1 || (LoInput && M >= 4) || (!LoInput && M < 4))
-      continue;
-    if (M - (LoInput ? 0 : Size) != i)
-      return false;
-  }
-  return true;
-}
-
 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
 // 2013 will allow us to use it as a non-type template parameter.
 namespace {
@@ -7385,13 +7370,48 @@ static SDValue lowerV4F32VectorShuffle(S
     // INSERTPS when the V1 elements are already in the correct locations
     // because otherwise we can just always use two SHUFPS instructions which
     // are much smaller to encode than a SHUFPS and an INSERTPS.
-    if (Subtarget->hasSSE41() &&
-        isShuffleMaskInputInPlace(Mask, /*LoInput*/ true)) {
-      // Insert the V2 element into the desired position.
-      SDValue InsertPSMask =
-          DAG.getIntPtrConstant(Mask[V2Index] << 6 | V2Index << 4);
-      return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
-                         InsertPSMask);
+    if (Subtarget->hasSSE41()) {
+      // When using INSERTPS we can zero any lane of the destination. Collect
+      // the zero inputs into a mask and drop them from the lanes of V1 which
+      // actually need to be present as inputs to the INSERTPS.
+      unsigned ZMask = 0;
+      if (ISD::isBuildVectorAllZeros(V1.getNode())) {
+        ZMask = 0xF ^ (1 << V2Index);
+      } else if (V1.getOpcode() == ISD::BUILD_VECTOR) {
+        for (int i = 0; i < 4; ++i) {
+          int M = Mask[i];
+          if (M >= 4)
+            continue;
+          if (M > -1) {
+            SDValue Input = V1.getOperand(M);
+            if (Input.getOpcode() != ISD::UNDEF &&
+                !X86::isZeroNode(Input)) {
+              // A non-zero input!
+              ZMask = 0;
+              break;
+            }
+          }
+          ZMask |= 1 << i;
+        }
+      }
+
+      // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
+      int InsertShuffleMask[4] = {-1, -1, -1, -1};
+      for (int i = 0; i < 4; ++i)
+        if (i != V2Index && (ZMask & (1 << i)) == 0)
+          InsertShuffleMask[i] = Mask[i];
+
+      if (isNoopShuffleMask(InsertShuffleMask)) {
+        // Replace V1 with undef if nothing from V1 survives the INSERTPS.
+        if ((ZMask | 1 << V2Index) == 0xF)
+          V1 = DAG.getUNDEF(MVT::v4f32);
+
+        // Insert the V2 element into the desired position.
+        SDValue InsertPSMask =
+            DAG.getIntPtrConstant(Mask[V2Index] << 6 | V2Index << 4 | ZMask);
+        return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+                           InsertPSMask);
+      }
     }
 
     // Compute the index adjacent to V2Index and in the same half by toggling

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll?rev=217117&r1=217116&r2=217117&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll Wed Sep  3 20:13:48 2014
@@ -207,3 +207,113 @@ define <4 x i32> @shuffle_v4i32_4015(<4
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
   ret <4 x i32> %shuffle
 }
+
+define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_4zzz
+; SSE2:         xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,0],[[X]][1,0]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2],[[X]][2,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_4zzz
+; SSE41:         insertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_4zzz
+; AVX1:         vinsertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_z4zz
+; SSE2:         xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,0],[[X]][2,0]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[2,0],[[X]][3,0]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_z4zz
+; SSE41:         insertps {{.*}} # xmm0 = zero,xmm0[0],zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_z4zz
+; AVX1:         vinsertps {{.*}} # xmm0 = zero,xmm0[0],zero,zero
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_zz4z
+; SSE2:         xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,0],[[X]][0,0]
+; SSE2-NEXT:    shufps {{.*}} # [[X]] = [[X]][0,0],xmm0[0,2]
+; SSE2-NEXT:    movaps %[[X]], %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_zz4z
+; SSE41:         insertps {{.*}} # xmm0 = zero,zero,xmm0[0],zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_zz4z
+; AVX1:         vinsertps {{.*}} # xmm0 = zero,zero,xmm0[0],zero
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_zuu4
+; SSE2:         xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    shufps {{.*}} # [[X]] = [[X]][0,1],xmm0[2,0]
+; SSE2-NEXT:    movaps %[[X]], %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_zuu4
+; SSE41:         insertps {{.*}} # xmm0 = zero,zero,zero,xmm0[0]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_zuu4
+; AVX1:         vinsertps {{.*}} # xmm0 = zero,zero,zero,xmm0[0]
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_zzz7
+; SSE2:         xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[3,0],[[X]][2,0]
+; SSE2-NEXT:    shufps {{.*}} # [[X]] = [[X]][0,1],xmm0[2,0]
+; SSE2-NEXT:    movaps %[[X]], %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_zzz7
+; SSE41:         insertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_zzz7
+; AVX1:         vinsertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_z6zz
+; SSE2:         xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[2,0],[[X]][0,0]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[2,0],[[X]][2,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_z6zz
+; SSE41:         insertps {{.*}} # xmm0 = zero,xmm0[2],zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_z6zz
+; AVX1:         vinsertps {{.*}} # xmm0 = zero,xmm0[2],zero,zero
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x float> %shuffle
+}