[llvm] 4c41caa - [x86] improve CMOV codegen by pushing add into operands, part 3

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 28 06:11:16 PDT 2021


Author: Sanjay Patel
Date: 2021-07-28T09:10:33-04:00
New Revision: 4c41caa2871095cf1e936b0eea10079c60f864dc

URL: https://github.com/llvm/llvm-project/commit/4c41caa2871095cf1e936b0eea10079c60f864dc
DIFF: https://github.com/llvm/llvm-project/commit/4c41caa2871095cf1e936b0eea10079c60f864dc.diff

LOG: [x86] improve CMOV codegen by pushing add into operands, part 3

In this episode, we are trying to avoid an x86 micro-arch quirk where complex
(3 operand) LEA potentially costs significantly more than simple LEA. So we
simultaneously push and pull the math around the CMOV to balance the operations.

I looked at the debug spew during instruction selection and decided against
trying a later DAGToDAG transform -- it seems very difficult to match if the
trailing memops are already selected and managing the creation of extra
instructions at that level is always tricky.

Differential Revision: https://reviews.llvm.org/D106918

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/add-cmov.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3a64b3460030..144c81b3ebeb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -49961,11 +49961,34 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) {
   if (!isSuitableCmov(Cmov))
     return SDValue();
 
-  // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
   SDValue FalseOp = Cmov.getOperand(0);
   SDValue TrueOp = Cmov.getOperand(1);
+
+  // We will push the add through the select, but we can potentially do better
+  // if we know there is another add in the sequence and this is pointer math.
+  // In that case, we can absorb an add into the trailing memory op and avoid
+  // a 3-operand LEA which is likely slower than a 2-operand LEA.
+  // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
+  if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
+      !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
+      all_of(N->uses(), [&](SDNode *Use) {
+        auto *MemNode = dyn_cast<MemSDNode>(Use);
+        return MemNode && MemNode->getBasePtr().getNode() == N;
+      })) {
+    // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
+    // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
+    //       it is possible that choosing op1 might be better.
+    SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
+    FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
+    TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
+    Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
+                       Cmov.getOperand(2), Cmov.getOperand(3));
+    return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
+  }
+
+  // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
   FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
   TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
   return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),

diff --git a/llvm/test/CodeGen/X86/add-cmov.ll b/llvm/test/CodeGen/X86/add-cmov.ll
index 8868124ab4c2..6dc49e497fd2 100644
--- a/llvm/test/CodeGen/X86/add-cmov.ll
+++ b/llvm/test/CodeGen/X86/add-cmov.ll
@@ -279,11 +279,11 @@ define void @bullet_load_store(i32 %x, i64 %y, %class.btAxis* %p) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    leaq (%rsi,%rsi,4), %rax
 ; CHECK-NEXT:    shlq $4, %rax
+; CHECK-NEXT:    leaq 66(%rdx), %rcx
+; CHECK-NEXT:    addq $60, %rdx
 ; CHECK-NEXT:    testb $1, %dil
-; CHECK-NEXT:    leaq 60(%rdx,%rax), %rcx
-; CHECK-NEXT:    leaq 66(%rdx,%rax), %rax
-; CHECK-NEXT:    cmoveq %rcx, %rax
-; CHECK-NEXT:    decw (%rax)
+; CHECK-NEXT:    cmovneq %rcx, %rdx
+; CHECK-NEXT:    decw (%rdx,%rax)
 ; CHECK-NEXT:    retq
   %and = and i32 %x, 1
   %b = icmp eq i32 %and, 0
@@ -299,11 +299,11 @@ define void @bullet_load_store(i32 %x, i64 %y, %class.btAxis* %p) {
 define void @complex_lea_alt1(i1 %b, i16* readnone %ptr, i64 %idx) {
 ; CHECK-LABEL: complex_lea_alt1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    leaq 60(%rdx,%rsi), %rax
-; CHECK-NEXT:    leaq 66(%rdx,%rsi), %rcx
+; CHECK-NEXT:    leaq 60(%rdx), %rax
+; CHECK-NEXT:    addq $66, %rdx
 ; CHECK-NEXT:    testb $1, %dil
-; CHECK-NEXT:    cmovneq %rax, %rcx
-; CHECK-NEXT:    decw (%rcx)
+; CHECK-NEXT:    cmovneq %rax, %rdx
+; CHECK-NEXT:    decw (%rdx,%rsi)
 ; CHECK-NEXT:    retq
   %i = ptrtoint i16* %ptr to i64
   %sum = add i64 %idx, %i
@@ -320,11 +320,11 @@ define void @complex_lea_alt1(i1 %b, i16* readnone %ptr, i64 %idx) {
 define void @complex_lea_alt2(i1 %b, i16* readnone %ptr, i64 %idx) {
 ; CHECK-LABEL: complex_lea_alt2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    leaq 60(%rsi,%rdx), %rax
-; CHECK-NEXT:    leaq 66(%rsi,%rdx), %rcx
+; CHECK-NEXT:    leaq 60(%rsi), %rax
+; CHECK-NEXT:    addq $66, %rsi
 ; CHECK-NEXT:    testb $1, %dil
-; CHECK-NEXT:    cmovneq %rax, %rcx
-; CHECK-NEXT:    decw (%rcx)
+; CHECK-NEXT:    cmovneq %rax, %rsi
+; CHECK-NEXT:    decw (%rsi,%rdx)
 ; CHECK-NEXT:    retq
   %i = ptrtoint i16* %ptr to i64
   %sum = add i64 %i, %idx
@@ -433,11 +433,11 @@ define void @complex_lea_alt6(i1 %b, i16* readnone %ptr, i64 %idx) {
 define void @complex_lea_alt7(i1 %b, i16* readnone %ptr, i64 %idx) {
 ; CHECK-LABEL: complex_lea_alt7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    leaq 60(%rdx,%rsi), %rax
-; CHECK-NEXT:    leaq 66(%rdx,%rsi), %rcx
+; CHECK-NEXT:    leaq 60(%rdx), %rax
+; CHECK-NEXT:    addq $66, %rdx
 ; CHECK-NEXT:    testb $1, %dil
-; CHECK-NEXT:    cmovneq %rax, %rcx
-; CHECK-NEXT:    decw (%rcx)
+; CHECK-NEXT:    cmovneq %rax, %rdx
+; CHECK-NEXT:    decw (%rdx,%rsi)
 ; CHECK-NEXT:    retq
   %i = ptrtoint i16* %ptr to i64
   %o = add i64 %idx, %i
@@ -455,11 +455,11 @@ define void @complex_lea_alt7(i1 %b, i16* readnone %ptr, i64 %idx) {
 define void @complex_lea_alt8(i1 %b, i16* readnone %ptr, i64 %idx) {
 ; CHECK-LABEL: complex_lea_alt8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    leaq 60(%rsi,%rdx), %rax
-; CHECK-NEXT:    leaq 66(%rsi,%rdx), %rcx
+; CHECK-NEXT:    leaq 60(%rsi), %rax
+; CHECK-NEXT:    addq $66, %rsi
 ; CHECK-NEXT:    testb $1, %dil
-; CHECK-NEXT:    cmovneq %rax, %rcx
-; CHECK-NEXT:    decw (%rcx)
+; CHECK-NEXT:    cmovneq %rax, %rsi
+; CHECK-NEXT:    decw (%rsi,%rdx)
 ; CHECK-NEXT:    retq
   %i = ptrtoint i16* %ptr to i64
   %o = add i64 %i, %idx


        


More information about the llvm-commits mailing list