[llvm] 3f76497 - [X86] Move combineIncDecVector logic from Select to PreprocessISelDAG.

Sat Feb 15 10:13:03 PST 2020

Author: Craig Topper
Date: 2020-02-15T09:59:12-08:00
New Revision: 3f7649799bedfaa97ced81c95414af23174f811e

URL: https://github.com/llvm/llvm-project/commit/3f7649799bedfaa97ced81c95414af23174f811e
DIFF: https://github.com/llvm/llvm-project/commit/3f7649799bedfaa97ced81c95414af23174f811e.diff

LOG: [X86] Move combineIncDecVector logic from Select to PreprocessISelDAG.

This allows it to work properly with masked inc/dec for avx512. Those
would have a vselect as the root node so didn't get a chance to call
combineIncDecVector.

This also simplifies the logic because we don't have to manage
the topological ordering.

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
    llvm/test/CodeGen/X86/avx512-arith.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 2ba86738ee3d..f0b40eeef18c 100644

--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -513,7 +513,6 @@ namespace {
     bool shrinkAndImmediate(SDNode *N);
     bool isMaskZeroExtended(SDNode *N) const;
     bool tryShiftAmountMod(SDNode *N);
-    bool combineIncDecVector(SDNode *Node);
     bool tryShrinkShlLogicImm(SDNode *N);
     bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
     bool tryMatchBitSelect(SDNode *N);
@@ -809,6 +808,38 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
       continue;
     }
 
+    /// Convert vector increment or decrement to sub/add with an all-ones
+    /// constant:
+    /// add X, <1, 1...> --> sub X, <-1, -1...>
+    /// sub X, <1, 1...> --> add X, <-1, -1...>
+    /// The all-ones vector constant can be materialized using a pcmpeq
+    /// instruction that is commonly recognized as an idiom (has no register
+    /// dependency), so that's better/smaller than loading a splat 1 constant.
+    if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+        N->getSimpleValueType(0).isVector()) {
+
+      APInt SplatVal;
+      if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
+          SplatVal.isOneValue()) {
+        SDLoc DL(N);
+
+        MVT VT = N->getSimpleValueType(0);
+        unsigned NumElts = VT.getSizeInBits() / 32;
+        SDValue AllOnes =
+            CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
+        AllOnes = CurDAG->getBitcast(VT, AllOnes);
+
+        unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
+        SDValue Res =
+            CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
+        --I;
+        CurDAG->ReplaceAllUsesWith(N, Res.getNode());
+        ++I;
+        CurDAG->DeleteNode(N);
+        continue;
+      }
+    }
+
     switch (N->getOpcode()) {
     case ISD::FP_ROUND:
     case ISD::STRICT_FP_ROUND:
@@ -3899,52 +3930,6 @@ bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
   return true;
 }
 
-/// Convert vector increment or decrement to sub/add with an all-ones constant:
-/// add X, <1, 1...> --> sub X, <-1, -1...>
-/// sub X, <1, 1...> --> add X, <-1, -1...>
-/// The all-ones vector constant can be materialized using a pcmpeq instruction
-/// that is commonly recognized as an idiom (has no register dependency), so
-/// that's better/smaller than loading a splat 1 constant.
-bool X86DAGToDAGISel::combineIncDecVector(SDNode *Node) {
-  assert((Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::SUB) &&
-         "Unexpected opcode for increment/decrement transform");
-
-  EVT VT = Node->getValueType(0);
-  assert(VT.isVector() && "Should only be called for vectors.");
-
-  SDValue X = Node->getOperand(0);
-  SDValue OneVec = Node->getOperand(1);
-
-  APInt SplatVal;
-  if (!X86::isConstantSplat(OneVec, SplatVal) || !SplatVal.isOneValue())
-    return false;
-
-  SDLoc DL(Node);
-  SDValue OneConstant, AllOnesVec;
-
-  APInt Ones = APInt::getAllOnesValue(32);
-  assert(VT.getSizeInBits() % 32 == 0 &&
-         "Expected bit count to be a multiple of 32");
-  OneConstant = CurDAG->getConstant(Ones, DL, MVT::i32);
-  insertDAGNode(*CurDAG, X, OneConstant);
-
-  unsigned NumElts = VT.getSizeInBits() / 32;
-  assert(NumElts > 0 && "Expected to get non-empty vector.");
-  AllOnesVec = CurDAG->getSplatBuildVector(MVT::getVectorVT(MVT::i32, NumElts),
-                                           DL, OneConstant);
-  insertDAGNode(*CurDAG, X, AllOnesVec);
-
-  AllOnesVec = CurDAG->getBitcast(VT, AllOnesVec);
-  insertDAGNode(*CurDAG, X, AllOnesVec);
-
-  unsigned NewOpcode = Node->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
-  SDValue NewNode = CurDAG->getNode(NewOpcode, DL, VT, X, AllOnesVec);
-
-  ReplaceNode(Node, NewNode.getNode());
-  SelectCode(NewNode.getNode());
-  return true;
-}
-
 /// If the high bits of an 'and' operand are known zero, try setting the
 /// high bits of an 'and' constant operand to produce a smaller encoding by
 /// creating a small, sign-extended negative immediate rather than a large
@@ -4579,10 +4564,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     LLVM_FALLTHROUGH;
   case ISD::ADD:
   case ISD::SUB: {
-    if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && NVT.isVector() &&
-        combineIncDecVector(Node))
-      return;
-
     // Try to avoid folding immediates with multiple uses for optsize.
     // This code tries to select to register form directly to avoid going
     // through the isel table which might fold the immediate. We can't change

diff  --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll
index be88e3530a3c..58953e287e0a 100644
--- a/llvm/test/CodeGen/X86/avx512-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-arith.ll
@@ -1153,3 +1153,29 @@ define <16 x float> @fabs_v16f32(<16 x float> %p)
   ret <16 x float> %t
 }
 declare <16 x float> @llvm.fabs.v16f32(<16 x float> %p)
+
+define <16 x i32> @masked_inc_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: masked_inc_test:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; CHECK-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @masked_dec_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: masked_dec_test:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; CHECK-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %x = sub <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
+  ret <16 x i32> %r
+}