[llvm] r275270 - [X86][AVX] Add support for target shuffle combining to VPERMILPS variable shuffle mask

Wed Jul 13 08:10:44 PDT 2016

Author: rksimon
Date: Wed Jul 13 10:10:43 2016
New Revision: 275270

URL: http://llvm.org/viewvc/llvm-project?rev=275270&view=rev
Log:
[X86][AVX] Add support for target shuffle combining to VPERMILPS variable shuffle mask

Added AVX512F VPERMILPS shuffle decoding support

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
    llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=275270&r1=275269&r2=275270&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Jul 13 10:10:43 2016
@@ -3829,6 +3829,7 @@ static bool isTargetShuffleVariableMask(
   switch (Opcode) {
   default: return false;
   case X86ISD::PSHUFB:
+  case X86ISD::VPERMILPV:
     return true;
   }
 }
@@ -25211,13 +25212,42 @@ static bool combineX86ShuffleChain(SDVal
   if (Depth < 2)
     return false;
 
+  if (is128BitLaneCrossingShuffleMask(MaskVT, Mask))
+    return false;
+
+  bool MaskContainsZeros =
+      llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
+
+  // If we have a single input shuffle with different shuffle patterns in the
+  // the 128-bit lanes use the variable mask to VPERMILPS.
+  // TODO Combine other mask types at higher depths.
+  if (HasVariableMask && !MaskContainsZeros &&
+      ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
+       (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
+    SmallVector<SDValue, 16> VPermIdx;
+    for (int M : Mask) {
+      SDValue Idx =
+          M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
+      VPermIdx.push_back(Idx);
+    }
+    MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
+    SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
+    DCI.AddToWorklist(VPermMask.getNode());
+    Res = DAG.getBitcast(MaskVT, Input);
+    DCI.AddToWorklist(Res.getNode());
+    Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
+    DCI.AddToWorklist(Res.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                  /*AddTo*/ true);
+    return true;
+  }
+
   // If we have 3 or more shuffle instructions or a chain involving a variable
   // mask, we can replace them with a single PSHUFB instruction profitably.
   // Intel's manuals suggest only using PSHUFB if doing so replacing 5
   // instructions, but in practice PSHUFB tends to be *very* fast so we're
   // more aggressive.
   if ((Depth >= 3 || HasVariableMask) &&
-      !is128BitLaneCrossingShuffleMask(MaskVT, Mask) &&
       ((VT.is128BitVector() && Subtarget.hasSSSE3()) ||
        (VT.is256BitVector() && Subtarget.hasAVX2()) ||
        (VT.is512BitVector() && Subtarget.hasBWI()))) {

Modified: llvm/trunk/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp?rev=275270&r1=275269&r2=275270&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp Wed Jul 13 10:10:43 2016
@@ -104,9 +104,11 @@ void DecodeVPERMILPMask(const Constant *
   //   <4 x i32> <i32 -2147483648, i32 -2147483648,
   //              i32 -2147483648, i32 -2147483648>
 
-  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  if (ElSize != 32 && ElSize != 64)
+    return;
 
-  if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512.
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  if (MaskTySize != 128 && MaskTySize != 256 && MaskTySize != 512)
     return;
 
   // Only support vector types.
@@ -126,7 +128,8 @@ void DecodeVPERMILPMask(const Constant *
     return;
 
   unsigned NumElements = MaskTySize / ElSize;
-  assert((NumElements == 2 || NumElements == 4 || NumElements == 8) &&
+  assert((NumElements == 2 || NumElements == 4 || NumElements == 8 ||
+          NumElements == 16) &&
          "Unexpected number of vector elements.");
   ShuffleMask.reserve(NumElements);
   unsigned NumElementsPerLane = 128 / ElSize;

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll?rev=275270&r1=275269&r2=275270&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll Wed Jul 13 10:10:43 2016
@@ -91,6 +91,16 @@ define <8 x float> @combine_vpermilvar_8
   ret <8 x float> %2
 }
 
+define <8 x float> @combine_vpermilvar_8f32_10326u4u(<8 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_8f32_10326u4u:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,6,u,4,u]
+; ALL-NEXT:    retq
+  %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 0, i32 1, i32 2, i32 undef>)
+  %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 undef>)
+  ret <8 x float> %2
+}
+
 define <8 x float> @combine_vpermilvar_vperm2f128_8f32(<8 x float> %a0) {
 ; ALL-LABEL: combine_vpermilvar_vperm2f128_8f32:
 ; ALL:       # BB#0:

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll?rev=275270&r1=275269&r2=275270&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll Wed Jul 13 10:10:43 2016
@@ -3,6 +3,8 @@
 
 declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
+declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
+
 declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8)
 declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
 declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
@@ -437,6 +439,16 @@ define <8 x double> @combine_permvar_8f6
   ret <8 x double> %1
 }
 
+define <16 x float> @combine_vpermilvar_16f32_230146759A8BCFDE(<16 x float> %x0) {
+; CHECK-LABEL: combine_vpermilvar_16f32_230146759A8BCFDE:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermilps {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1, i32 1, i32 0, i32 3, i32 2>, <16 x float> undef, i16 -1)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %res0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 1, i32 0, i32 2, i32 3, i32 0, i32 2, i32 1, i32 1, i32 2, i32 0, i32 3>, <16 x float> undef, i16 -1)
+  ret <16 x float> %res1
+}
+
 define <64 x i8> @combine_pshufb_as_pslldq(<64 x i8> %a0) {
 ; CHECK-LABEL: combine_pshufb_as_pslldq:
 ; CHECK:       # BB#0: