[llvm] r287612 - [AVX-512] Add support for changing the element size of PALIGNR/VALIGND/VALIGNQ shuffles if they feed a vselect with a different type

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 21 19:51:53 PST 2016


Author: ctopper
Date: Mon Nov 21 21:51:53 2016
New Revision: 287612

URL: http://llvm.org/viewvc/llvm-project?rev=287612&view=rev
Log:
[AVX-512] Add support for changing the element size of PALIGNR/VALIGND/VALIGNQ shuffles if they feed a vselect with a different type

Summary:
Shuffle lowering widens the element size of a shuffle if the elements are contiguous. This is sometimes helpful because wider element types have more shuffle options. If the shuffle is one of the arguments to a vselect, this widening can introduce a bitcast between the vselect and the shuffle, which prevents isel from selecting a masked operation. If the shuffle can be written equally efficiently with a different element size that matches the vselect type, we should change the shuffle type to allow masking.
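
As a rough illustration of why this is legal (a standalone toy model, not LLVM code; it assumes x86's little-endian layout, and alignShuffle is a made-up helper): an align-style rotation expressed over 64-bit elements produces the same bytes as the equivalent rotation over 32-bit elements, so the shuffle is free to adopt whichever element size the vselect uses.

#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Toy model of an align-style shuffle over N-element vectors: element i of
// the result is element (i + Imm) of the concatenation Lo ++ Hi.
template <typename T, std::size_t N>
std::array<T, N> alignShuffle(const std::array<T, N> &Lo,
                              const std::array<T, N> &Hi, unsigned Imm) {
  std::array<T, N> Result{};
  for (std::size_t I = 0; I != N; ++I)
    Result[I] = (I + Imm < N) ? Lo[I + Imm] : Hi[I + Imm - N];
  return Result;
}

int main() {
  std::array<std::uint64_t, 2> A64 = {0x0123456789abcdefULL, 0xfedcba9876543210ULL};
  std::array<std::uint64_t, 2> B64 = {0x1111111122222222ULL, 0x3333333344444444ULL};

  // "Bitcast" the same bytes to 32-bit elements (little-endian, as on x86).
  std::array<std::uint32_t, 4> A32, B32;
  std::memcpy(A32.data(), A64.data(), sizeof(A64));
  std::memcpy(B32.data(), B64.data(), sizeof(B64));

  // A shift of one 64-bit element equals a shift of two 32-bit elements, so
  // the same rotation can be written at either width.
  auto R64 = alignShuffle(A64, B64, 1); // valignq-like, imm = 1
  auto R32 = alignShuffle(A32, B32, 2); // valignd-like, imm = 2
  assert(std::memcmp(R64.data(), R32.data(), sizeof(R64)) == 0);
  (void)R64; (void)R32;
  return 0;
}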

This patch does this conversion for all VALIGND/VALIGNQ sizes. It also supports turning 128-bit PALIGNR into VALIGND/VALIGNQ. This fixes the case shown in PR31018.
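
As a rough sketch of the immediate handling (a standalone example, not the patch itself, though it mirrors the divisibility check in the new combineBitcastForMaskedOp; the rescaleAlignImm helper is invented for illustration): the shift amount is converted to bits, and the shuffle is only rewritten when that shift is a whole number of elements of the new type.

#include <cstdint>
#include <cstdio>
#include <optional>

// Rescale an align immediate, given in units of the original element size
// (OpEltBits), into units of the element size the vselect uses (EltBits).
// Returns nothing if the shift is not a whole number of new elements.
std::optional<std::uint64_t> rescaleAlignImm(std::uint64_t Imm,
                                             unsigned OpEltBits,
                                             unsigned EltBits) {
  std::uint64_t ShiftBits = Imm * OpEltBits; // total shift in bits
  if (ShiftBits % EltBits != 0)
    return std::nullopt;
  return ShiftBits / EltBits;
}

int main() {
  // VALIGNQ imm 1 (one 64-bit element) can be rewritten as VALIGND imm 2.
  if (auto NewImm = rescaleAlignImm(1, 64, 32))
    std::printf("valignq 1 -> valignd %llu\n", (unsigned long long)*NewImm);
  // 128-bit PALIGNR imm 4 (four bytes) can be rewritten as VALIGND imm 1.
  if (auto NewImm = rescaleAlignImm(4, 8, 32))
    std::printf("palignr 4 -> valignd %llu\n", (unsigned long long)*NewImm);
  // PALIGNR imm 6 (48 bits) has no i32 equivalent, so the combine bails out.
  if (!rescaleAlignImm(6, 8, 32))
    std::printf("palignr 6: not expressible with i32 elements\n");
  return 0;
}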

I plan to add support for more operations in future patches.

Reviewers: RKSimon, zvi, delena

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D26902

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-masked.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=287612&r1=287611&r2=287612&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Nov 21 21:51:53 2016
@@ -27772,6 +27772,58 @@ static SDValue combineSelectOfTwoConstan
   return SDValue();
 }
 
+// If this is a bitcasted op that can be represented as another type, push the
+// bitcast to the inputs. This allows more opportunities for pattern
+// matching masked instructions. This is called when we know that the operation
+// is used as one of the inputs of a vselect.
+static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
+                                      TargetLowering::DAGCombinerInfo &DCI) {
+  // Make sure we have a bitcast.
+  if (OrigOp.getOpcode() != ISD::BITCAST)
+    return false;
+
+  SDValue Op = OrigOp.getOperand(0);
+
+  // If the operation is used by anything other than the bitcast, we shouldn't
+  // do this combine as that would replicate the operation.
+  if (!Op.hasOneUse())
+    return false;
+
+  MVT VT = OrigOp.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  SDLoc DL(Op.getNode());
+
+  switch (Op.getOpcode()) {
+  case X86ISD::PALIGNR:
+    // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
+    if (!VT.is128BitVector())
+      return false;
+    LLVM_FALLTHROUGH;
+  case X86ISD::VALIGN: {
+    if (EltVT != MVT::i32 && EltVT != MVT::i64)
+      return false;
+    uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+    MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
+    unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
+    unsigned EltSize = EltVT.getSizeInBits();
+    // Make sure we can represent the same shift with the new VT.
+    if ((ShiftAmt % EltSize) != 0)
+      return false;
+    Imm = ShiftAmt / EltSize;
+    SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
+    DCI.AddToWorklist(Op0.getNode());
+    SDValue Op1 = DAG.getBitcast(VT, Op.getOperand(1));
+    DCI.AddToWorklist(Op1.getNode());
+    DCI.CombineTo(OrigOp.getNode(),
+                  DAG.getNode(X86ISD::VALIGN, DL, VT, Op0, Op1,
+                              DAG.getConstant(Imm, DL, MVT::i8)));
+    return true;
+  }
+  }
+
+  return false;
+}
+
 /// Do target-specific dag combines on SELECT and VSELECT nodes.
 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI,
@@ -28133,6 +28185,17 @@ static SDValue combineSelect(SDNode *N,
     }
   }
 
+  // Look for vselects with LHS/RHS being bitcasted from an operation that
+  // can be executed on another type. Push the bitcast to the inputs of
+  // the operation. This exposes opportunities for using masking instructions.
+  if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalizeOps() &&
+      CondVT.getVectorElementType() == MVT::i1) {
+    if (combineBitcastForMaskedOp(LHS, DAG, DCI))
+      return SDValue(N, 0);
+    if (combineBitcastForMaskedOp(RHS, DAG, DCI))
+      return SDValue(N, 0);
+  }
+
   return SDValue();
 }
 

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll?rev=287612&r1=287611&r2=287612&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll Mon Nov 21 21:51:53 2016
@@ -360,9 +360,9 @@ define <16 x i32> @shuffle_v16i32_01_02_
 define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01(<16 x i32> %a, <16 x i32> %passthru, i16 %mask) {
 ; ALL-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
 ; ALL:       # BB#0:
-; ALL-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0]
 ; ALL-NEXT:    kmovw %edi, %k1
-; ALL-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; ALL-NEXT:    valignd {{.*#+}} zmm1 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
+; ALL-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1>
   %mask.cast = bitcast i16 %mask to <16 x i1>
@@ -373,9 +373,9 @@ define <16 x i32> @mask_shuffle_v16i32_0
 define <16 x i32> @mask_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
 ; ALL-LABEL: mask_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
 ; ALL:       # BB#0:
-; ALL-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm1[0]
 ; ALL-NEXT:    kmovw %edi, %k1
-; ALL-NEXT:    vpblendmd %zmm0, %zmm2, %zmm0 {%k1}
+; ALL-NEXT:    valignd {{.*#+}} zmm2 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
+; ALL-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
   %mask.cast = bitcast i16 %mask to <16 x i1>
@@ -386,9 +386,8 @@ define <16 x i32> @mask_shuffle_v16i32_0
 define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01(<16 x i32> %a, i16 %mask) {
 ; ALL-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
 ; ALL:       # BB#0:
-; ALL-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0]
 ; ALL-NEXT:    kmovw %edi, %k1
-; ALL-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; ALL-NEXT:    valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1>
   %mask.cast = bitcast i16 %mask to <16 x i1>
@@ -399,9 +398,8 @@ define <16 x i32> @maskz_shuffle_v16i32_
 define <16 x i32> @maskz_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
 ; ALL-LABEL: maskz_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
 ; ALL:       # BB#0:
-; ALL-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm1[0]
 ; ALL-NEXT:    kmovw %edi, %k1
-; ALL-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; ALL-NEXT:    valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
   %mask.cast = bitcast i16 %mask to <16 x i1>

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-masked.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-masked.ll?rev=287612&r1=287611&r2=287612&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-masked.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-masked.ll Mon Nov 21 21:51:53 2016
@@ -4,9 +4,9 @@
 define <4 x i32> @mask_shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_shuffle_v4i32_1234:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    vpblendmd %xmm0, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT:    valignd {{.*#+}} xmm2 {%k1} = xmm0[1,2,3],xmm1[0]
+; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -18,9 +18,8 @@ define <4 x i32> @mask_shuffle_v4i32_123
 define <4 x i32> @maskz_shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
 ; CHECK-LABEL: maskz_shuffle_v4i32_1234:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    valignd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2,3],xmm1[0]
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -32,9 +31,9 @@ define <4 x i32> @maskz_shuffle_v4i32_12
 define <4 x i32> @mask_shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_shuffle_v4i32_2345:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    vpblendmd %xmm0, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT:    valignd {{.*#+}} xmm2 {%k1} = xmm0[2,3],xmm1[0,1]
+; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -46,9 +45,8 @@ define <4 x i32> @mask_shuffle_v4i32_234
 define <4 x i32> @maskz_shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
 ; CHECK-LABEL: maskz_shuffle_v4i32_2345:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    valignd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3],xmm1[0,1]
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -60,9 +58,9 @@ define <4 x i32> @maskz_shuffle_v4i32_23
 define <2 x i64> @mask_shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_shuffle_v2i64_12:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    vpblendmq %xmm0, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT:    valignq {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[0]
+; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -74,9 +72,8 @@ define <2 x i64> @mask_shuffle_v2i64_12(
 define <2 x i64> @maskz_shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: maskz_shuffle_v2i64_12:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    valignq {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[0]
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -167,9 +164,9 @@ define <8 x i32> @maskz_shuffle_v8i32_12
 define <8 x i32> @mask_shuffle_v8i32_23456789(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_shuffle_v8i32_23456789:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    valignq {{.*#+}} ymm0 = ymm0[1,2,3],ymm1[0]
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    valignd {{.*#+}} ymm2 {%k1} = ymm0[2,3,4,5,6,7],ymm1[0,1]
+; CHECK-NEXT:    vmovdqa64 %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -180,9 +177,8 @@ define <8 x i32> @mask_shuffle_v8i32_234
 define <8 x i32> @maskz_shuffle_v8i32_23456789(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
 ; CHECK-LABEL: maskz_shuffle_v8i32_23456789:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    valignq {{.*#+}} ymm0 = ymm0[1,2,3],ymm1[0]
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    valignd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,4,5,6,7],ymm1[0,1]
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
   %mask.cast = bitcast i8 %mask to <8 x i1>

More information about the llvm-commits mailing list