[llvm] f060aa1 - [x86] improve CMOV codegen by pushing add into operands

Fri Jul 23 06:41:10 PDT 2021

Author: Sanjay Patel
Date: 2021-07-23T09:39:32-04:00
New Revision: f060aa1cf3f42ca967c3f63e18381d3579bb12d9

URL: https://github.com/llvm/llvm-project/commit/f060aa1cf3f42ca967c3f63e18381d3579bb12d9
DIFF: https://github.com/llvm/llvm-project/commit/f060aa1cf3f42ca967c3f63e18381d3579bb12d9.diff

LOG: [x86] improve CMOV codegen by pushing add into operands

This is not the transform direction we want in general,
but by the time we have a CMOV, we've already tried
everything else that could be better.
The transform increases the uses of the other add operand,
but that is safe according to Alive2:
https://alive2.llvm.org/ce/z/Yn6p-A

We could probably extend this to other binops (not just add).
This is the motivating pattern discussed in:
https://llvm.org/PR51069

The test with i8 shows a missed fold because there's a trunc
sitting in front of the add. That can be handled with a small
follow-up.

Differential Revision: https://reviews.llvm.org/D106607

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/add-cmov.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 35e91dd94009..ffe361899567 100644

--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -49867,6 +49867,41 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
                           PMADDBuilder);
 }
 
+/// CMOV of constants requires materializing constant operands in registers.
+/// Try to fold those constants into an 'add' instruction to reduce instruction
+/// count. We do this with CMOV rather than the generic 'select' because there
+/// are earlier folds that may turn select-of-constants into logic hacks.
+static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) {
+  // Require a zero operand: add-of-0 gets simplified away, so only one of the
+  // two adds created below remains. TODO: Allow generating an extra add?
+  auto isSuitableCmov = [](SDValue V) {
+    if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
+      return false;
+    return isa<ConstantSDNode>(V.getOperand(0)) &&
+           isa<ConstantSDNode>(V.getOperand(1)) &&
+           (isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)));
+  };
+
+  // Accept a suitable CMOV as either operand of the add (swap it to first).
+  SDValue Cmov = N->getOperand(0);
+  SDValue OtherOp = N->getOperand(1);
+  if (!isSuitableCmov(Cmov))
+    std::swap(Cmov, OtherOp);
+  if (!isSuitableCmov(Cmov))
+    return SDValue();
+
+  // add (cmov C, 0), OtherOp --> cmov (add OtherOp, C), OtherOp
+  // add (cmov 0, C), OtherOp --> cmov OtherOp, (add OtherOp, C)
+  SDLoc DL(N);
+  SDValue FalseOp = Cmov.getOperand(0);
+  SDValue TrueOp = Cmov.getOperand(1);
+  EVT VT = N->getValueType(0);
+  FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
+  TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
+  return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
+                     Cmov.getOperand(3));
+}
+
 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
@@ -49874,6 +49909,9 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
 
+  if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG))
+    return Select;
+
   if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
     return MAdd;
   if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))

diff  --git a/llvm/test/CodeGen/X86/add-cmov.ll b/llvm/test/CodeGen/X86/add-cmov.ll
index 71ddea4b0c94..bd3d96e3aaea 100644
--- a/llvm/test/CodeGen/X86/add-cmov.ll
+++ b/llvm/test/CodeGen/X86/add-cmov.ll
@@ -4,11 +4,9 @@
 define i64 @select_consts_i64(i64 %offset, i32 %x) {
 ; CHECK-LABEL: select_consts_i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    leaq 42(%rdi), %rax
 ; CHECK-NEXT:    testl %esi, %esi
-; CHECK-NEXT:    movl $42, %eax
-; CHECK-NEXT:    cmovneq %rcx, %rax
-; CHECK-NEXT:    addq %rdi, %rax
+; CHECK-NEXT:    cmovneq %rdi, %rax
 ; CHECK-NEXT:    retq
   %b = icmp eq i32 %x, 0
   %s = select i1 %b, i64 42, i64 0
@@ -19,11 +17,10 @@ define i64 @select_consts_i64(i64 %offset, i32 %x) {
 define i32 @select_consts_i32(i32 %offset, i64 %x) {
 ; CHECK-LABEL: select_consts_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    leal 43(%rdi), %eax
 ; CHECK-NEXT:    cmpq $42, %rsi
-; CHECK-NEXT:    movl $43, %eax
-; CHECK-NEXT:    cmovgel %ecx, %eax
-; CHECK-NEXT:    addl %edi, %eax
+; CHECK-NEXT:    cmovgel %edi, %eax
 ; CHECK-NEXT:    retq
   %b = icmp sgt i64 %x, 41
   %s = select i1 %b, i32 0, i32 43
@@ -34,11 +31,10 @@ define i32 @select_consts_i32(i32 %offset, i64 %x) {
 define i16 @select_consts_i16(i16 %offset, i1 %b) {
 ; CHECK-LABEL: select_consts_i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    leal 44(%rdi), %eax
 ; CHECK-NEXT:    testb $1, %sil
-; CHECK-NEXT:    movl $44, %eax
-; CHECK-NEXT:    cmovel %ecx, %eax
-; CHECK-NEXT:    addl %edi, %eax
+; CHECK-NEXT:    cmovel %edi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %s = select i1 %b, i16 44, i16 0