[llvm] r211974 - [x86] Add handling for splat-like widenings of v16i8 shuffles.

Chandler Carruth chandlerc at gmail.com
Fri Jun 27 22:16:41 PDT 2014


Author: chandlerc
Date: Sat Jun 28 00:16:40 2014
New Revision: 211974

URL: http://llvm.org/viewvc/llvm-project?rev=211974&view=rev
Log:
[x86] Add handling for splat-like widenings of v16i8 shuffles.

These show up really frequently, not least with actual splats. =] We
lowered these quite badly before. The new code path tries to widen i8
shuffles to i16 shuffles in a splat-like way, duplicating bytes so that
each pair of adjacent bytes reads the same input element. There are
still some inefficiencies in our i16 splat logic, though, so we aren't
really done here.
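
To make the widening condition concrete, here is a minimal standalone
sketch of the duplication check in plain C++ -- a model of the idea, not
the committed LLVM code:

  #include <array>
  #include <cstdio>

  // A v16i8 mask can be re-expressed on i16 elements once every adjacent
  // byte pair reads the same source byte. Undef (-1) pairs pass only when
  // both entries are -1.
  static bool canWidenViaDuplication(const std::array<int, 16> &Mask) {
    for (int i = 0; i < 16; i += 2)
      if (Mask[i] != Mask[i + 1])
        return false;
    return true;
  }

  int main() {
    // A full v16i8 splat of byte 0: widenable; once byte 0 is duplicated
    // into an i16 lane, the rest is a plain v8i16 splat of lane 0.
    std::array<int, 16> Splat = {0, 0, 0, 0, 0, 0, 0, 0,
                                 0, 0, 0, 0, 0, 0, 0, 0};
    // The identity mask: adjacent bytes differ, so it cannot be widened.
    std::array<int, 16> Identity = {0, 1, 2,  3,  4,  5,  6,  7,
                                    8, 9, 10, 11, 12, 13, 14, 15};
    printf("splat:    %d\n", canWidenViaDuplication(Splat));    // prints 1
    printf("identity: %d\n", canWidenViaDuplication(Identity)); // prints 0
  }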

Also, for certain patterns (a bit of a gather-and-splat) we still
generate pretty silly code, and I've left a FIXME about addressing it.
However, I'm not especially worried about this pattern: the old shuffle
lowering generated a 29-instruction monstrosity for it that should
execute much more slowly.
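
Concretely, the shape I mean is the one exercised by the new
shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12 test below,
shown here as a plain C++ array purely for illustration:

  // Every fourth source byte splatted across a group of four lanes. The
  // adjacent-duplication check passes, but inputs 8 and 12 first have to
  // be gathered across the halves, and that byte pre-shuffle is where the
  // silly code comes from.
  static const int GatherSplatMask[16] = {0, 0, 0, 0, 4,  4,  4,  4,
                                          8, 8, 8, 8, 12, 12, 12, 12};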

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=211974&r1=211973&r2=211974&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sat Jun 28 00:16:40 2014
@@ -7690,6 +7690,86 @@ static SDValue lowerV16I8VectorShuffle(S
   MutableArrayRef<int> LoMask = Mask.slice(0, 8);
   MutableArrayRef<int> HiMask = Mask.slice(8, 8);
 
+  // For single-input shuffles, there are some nicer lowering tricks we can use.
+  if (isSingleInputShuffleMask(Mask)) {
+    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
+    // Notably, this handles splat and partial-splat shuffles more efficiently.
+    //
+    // FIXME: We should check for other patterns which can be widened into an
+    // i16 shuffle as well.
+    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
+      for (int i = 0; i < 16; i += 2) {
+        if (Mask[i] != Mask[i + 1])
+          return false;
+      }
+      return true;
+    };
+    if (canWidenViaDuplication(Mask)) {
+      SmallVector<int, 4> LoInputs;
+      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
+                   [](int M) { return M >= 0 && M < 8; });
+      std::sort(LoInputs.begin(), LoInputs.end());
+      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
+                     LoInputs.end());
+      SmallVector<int, 4> HiInputs;
+      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
+                   [](int M) { return M >= 8; });
+      std::sort(HiInputs.begin(), HiInputs.end());
+      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
+                     HiInputs.end());
+
+      bool TargetLo = LoInputs.size() >= HiInputs.size();
+      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
+      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
+
+      int ByteMask[16];
+      SmallDenseMap<int, int, 8> LaneMap;
+      for (int i = 0; i < 16; ++i)
+        ByteMask[i] = -1;
+      for (int I : InPlaceInputs) {
+        ByteMask[I] = I;
+        LaneMap[I] = I;
+      }
+      int FreeByteIdx = 0;
+      int TargetOffset = TargetLo ? 0 : 8;
+      for (int I : MovingInputs) {
+        // Walk the free index into the byte mask until we find an unoccupied
+        // spot. We bound this to 8 steps to catch bugs; the pigeonhole
+        // principle guarantees that there *must* be a spot, as we can have
+        // at most 8 duplicated inputs. We have to walk the index using modular
+        // arithmetic to wrap around as necessary.
+        // FIXME: We could do a much better job of picking an inexpensive slot
+        // so this doesn't go through the worst case for the byte shuffle.
+        for (int j = 0; j < 8 && ByteMask[FreeByteIdx + TargetOffset] != -1;
+             ++j, FreeByteIdx = (FreeByteIdx + 1) % 8)
+          ;
+        assert(ByteMask[FreeByteIdx + TargetOffset] == -1 &&
+               "Failed to find a free byte!");
+        ByteMask[FreeByteIdx + TargetOffset] = I;
+        LaneMap[I] = FreeByteIdx + TargetOffset;
+      }
+      V1 = DAG.getVectorShuffle(MVT::v16i8, DL, V1, DAG.getUNDEF(MVT::v16i8),
+                                ByteMask);
+      for (int &M : Mask)
+        if (M != -1)
+          M = LaneMap[M];
+
+      // Unpack the bytes to form the i16s that will be shuffled into place.
+      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+                       MVT::v16i8, V1, V1);
+
+      int I16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+      for (int i = 0; i < 16; i += 2) {
+        if (Mask[i] != -1)
+          I16Shuffle[i / 2] = Mask[i] - (TargetLo ? 0 : 8);
+        assert(I16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
+      }
+      return DAG.getVectorShuffle(MVT::v8i16, DL,
+                                  DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+                                  DAG.getUNDEF(MVT::v8i16), I16Shuffle);
+    }
+  }
+
   // Check whether an interleaving lowering is likely to be more efficient.
   // This isn't perfect but it is a strong heuristic that tends to work well on
   // the kinds of shuffles that show up in practice.

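For anyone who wants to see the bookkeeping end-to-end, here is a rough
standalone simulation of the mask transformation above, using plain std::
containers in place of SelectionDAG nodes (the in-place rewrite of Mask is
folded into the final mask computation), run on the splat-on-both-halves
test case added below:

  #include <algorithm>
  #include <cstdio>
  #include <iterator>
  #include <map>
  #include <vector>

  int main() {
    // Mask from shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08.
    std::vector<int> Mask = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};

    // Partition the used inputs by which half of the source they live in.
    std::vector<int> LoInputs, HiInputs;
    std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
                 [](int M) { return M >= 0 && M < 8; });
    std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
                 [](int M) { return M >= 8; });
    for (auto *V : {&LoInputs, &HiInputs}) {
      std::sort(V->begin(), V->end());
      V->erase(std::unique(V->begin(), V->end()), V->end());
    }

    // Keep the better-populated half in place; move the other half's inputs
    // into its free slots via the pigeonhole walk.
    bool TargetLo = LoInputs.size() >= HiInputs.size();
    const std::vector<int> &InPlaceInputs = TargetLo ? LoInputs : HiInputs;
    const std::vector<int> &MovingInputs = TargetLo ? HiInputs : LoInputs;

    int ByteMask[16];
    std::fill(std::begin(ByteMask), std::end(ByteMask), -1);
    std::map<int, int> LaneMap;
    for (int I : InPlaceInputs)
      ByteMask[I] = LaneMap[I] = I;

    int FreeByteIdx = 0, TargetOffset = TargetLo ? 0 : 8;
    for (int I : MovingInputs) {
      while (ByteMask[FreeByteIdx + TargetOffset] != -1)
        FreeByteIdx = (FreeByteIdx + 1) % 8;
      ByteMask[FreeByteIdx + TargetOffset] = I;
      LaneMap[I] = FreeByteIdx + TargetOffset;
    }

    // After the byte pre-shuffle and a self-unpack (PUNPCKLBW here, since
    // we target the low half), each duplicated byte pair forms one i16 lane;
    // derive the final v8i16 mask from where each input landed.
    int I16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
    for (int i = 0; i < 16; i += 2)
      if (Mask[i] != -1)
        I16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);

    printf("ByteMask:   ");
    for (int M : ByteMask)
      printf("%d ", M);
    printf("\nI16Shuffle: ");
    for (int M : I16Shuffle)
      printf("%d ", M);
    printf("\n");
  }

This prints ByteMask = {0, 8, -1, ...} and I16Shuffle = {0, 0, 0, 0, 1, 1,
1, 1}, which is exactly the splat pair realized by the pshuflw/pshufhw
CHECK lines in the new test for that mask.
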
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll?rev=211974&r1=211973&r2=211974&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll Sat Jun 28 00:16:40 2014
@@ -3,6 +3,93 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
 
+define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; CHECK-SSE2-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; CHECK-SSE2-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT:    punpckhbw %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    punpcklbw %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    punpcklwd %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    packuswb %xmm0, %xmm0
+; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; CHECK-SSE2-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
+; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,1,1,4,5,6,7]
+; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,6,7,7]
+; CHECK-SSE2-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT:    punpcklbw %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm2 = xmm2[2,1,2,3]
+; CHECK-SSE2-NEXT:    punpckhbw %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; CHECK-SSE2-NEXT:    punpcklwd %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,0,2,3,4,5,6,7]
+; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; CHECK-SSE2-NEXT:    packuswb %xmm0, %xmm0
+; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,0,1]
+; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,6,6]
+; CHECK-SSE2-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
+; CHECK-SSE2-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  ret <16 x i8> %shuffle
+}
+
 define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-SSE2-LABEL: @shuffle_v16i8_0101010101010101
 ; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
@@ -23,12 +110,9 @@ define <16 x i8> @shuffle_v16i8_00_16_01
 
 define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-SSE2-LABEL: @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07
-; CHECK-SSE2:         pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT:    punpcklbw %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[0,1,0,3]
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    punpcklbw %xmm1, %xmm1
 ; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    packuswb %xmm0, %xmm1
 ; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm1
 ; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
