[llvm] r352333 - [x86] add restriction for lowering to vpermps
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Sun Jan 27 13:53:33 PST 2019
Author: spatel
Date: Sun Jan 27 13:53:33 2019
New Revision: 352333
URL: http://llvm.org/viewvc/llvm-project?rev=352333&view=rev
Log:
[x86] add restriction for lowering to vpermps
This transform was added with rL351346, and we had
an escape for shufps, but we also want one for
unpckhps/unpcklps vs. vpermps because vpermps doesn't take
an immediate shuffle index operand.
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/vector-shuffle-128-unpck.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=352333&r1=352332&r2=352333&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Jan 27 13:53:33 2019
@@ -9719,6 +9719,21 @@ static bool isUnpackWdShuffleMask(ArrayR
return IsUnpackwdMask;
}
+static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
+ // Create 128-bit vector type based on mask size.
+ MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
+ MVT VT = MVT::getVectorVT(EltVT, Mask.size());
+
+ // Match any of unary/binary or low/high.
+ for (unsigned i = 0; i != 4; ++i) {
+ SmallVector<int, 16> UnpackMask;
+ createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
+ if (isTargetShuffleEquivalent(Mask, UnpackMask))
+ return true;
+ }
+ return false;
+}
+
/// Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
@@ -11709,8 +11724,10 @@ static SDValue lowerShuffleOfExtractsAsV
return SDValue();
// Final bailout: if the mask is simple, we are better off using an extract
- // and a simple narrow shuffle.
- if (NumElts == 4 && isSingleSHUFPSMask(NewMask))
+ // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
+ // because that avoids a constant load from memory.
+ if (NumElts == 4 &&
+ (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
return SDValue();
// Extend the shuffle mask with undef elements.
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-unpck.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-unpck.ll?rev=352333&r1=352332&r2=352333&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-unpck.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-unpck.ll Sun Jan 27 13:53:33 2019
@@ -45,23 +45,15 @@ define <2 x double> @unpckh_unary_extrac
ret <2 x double> %r
}
-; FIXME: vpermps requires a constant load for the index op. It's unlikely to be profitable.
+; vpermps requires a constant load for the index op. It's unlikely to be profitable.
define <4 x i32> @unpckh_unary_extracted_v8i32(<8 x i32> %x) {
-; AVX1-LABEL: unpckh_unary_extracted_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: unpckh_unary_extracted_v8i32:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <2,6,3,7,u,u,u,u>
-; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2OR512VL-NEXT: vzeroupper
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: unpckh_unary_extracted_v8i32:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -69,20 +61,12 @@ define <4 x i32> @unpckh_unary_extracted
}
define <4 x float> @unpckh_unary_extracted_v8f32(<8 x float> %x) {
-; AVX1-LABEL: unpckh_unary_extracted_v8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: unpckh_unary_extracted_v8f32:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <2,6,3,7,u,u,u,u>
-; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2OR512VL-NEXT: vzeroupper
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: unpckh_unary_extracted_v8f32:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -169,23 +153,15 @@ define <2 x double> @unpckl_unary_extrac
ret <2 x double> %r
}
-; FIXME: vpermps requires a constant load for the index op. It's unlikely to be profitable.
+; vpermps requires a constant load for the index op. It's unlikely to be profitable.
define <4 x i32> @unpckl_unary_extracted_v8i32(<8 x i32> %x) {
-; AVX1-LABEL: unpckl_unary_extracted_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: unpckl_unary_extracted_v8i32:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <0,4,1,5,u,u,u,u>
-; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2OR512VL-NEXT: vzeroupper
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: unpckl_unary_extracted_v8i32:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -193,20 +169,12 @@ define <4 x i32> @unpckl_unary_extracted
}
define <4 x float> @unpckl_unary_extracted_v8f32(<8 x float> %x) {
-; AVX1-LABEL: unpckl_unary_extracted_v8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: unpckl_unary_extracted_v8f32:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <0,4,1,5,u,u,u,u>
-; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2OR512VL-NEXT: vzeroupper
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: unpckl_unary_extracted_v8f32:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
More information about the llvm-commits
mailing list