[llvm] 4c41caa - [x86] improve CMOV codegen by pushing add into operands, part 3
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 28 06:11:16 PDT 2021
Author: Sanjay Patel
Date: 2021-07-28T09:10:33-04:00
New Revision: 4c41caa2871095cf1e936b0eea10079c60f864dc
URL: https://github.com/llvm/llvm-project/commit/4c41caa2871095cf1e936b0eea10079c60f864dc
DIFF: https://github.com/llvm/llvm-project/commit/4c41caa2871095cf1e936b0eea10079c60f864dc.diff
LOG: [x86] improve CMOV codegen by pushing add into operands, part 3
In this episode, we are trying to avoid an x86 micro-architecture quirk where a complex
(3-operand) LEA can cost significantly more than a simple LEA. So we
simultaneously push and pull the math around the CMOV to balance the operations.
I looked at the debug output during instruction selection and decided against
attempting a later DAGToDAG transform -- it seems very difficult to match once the
trailing memory ops are already selected, and managing the creation of extra
instructions at that level is always tricky.
Differential Revision: https://reviews.llvm.org/D106918
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/add-cmov.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3a64b3460030..144c81b3ebeb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -49961,11 +49961,34 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) {
if (!isSuitableCmov(Cmov))
return SDValue();
- // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
EVT VT = N->getValueType(0);
SDLoc DL(N);
SDValue FalseOp = Cmov.getOperand(0);
SDValue TrueOp = Cmov.getOperand(1);
+
+ // We will push the add through the select, but we can potentially do better
+ // if we know there is another add in the sequence and this is pointer math.
+ // In that case, we can absorb an add into the trailing memory op and avoid
+ // a 3-operand LEA which is likely slower than a 2-operand LEA.
+ // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
+ if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
+ !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
+ all_of(N->uses(), [&](SDNode *Use) {
+ auto *MemNode = dyn_cast<MemSDNode>(Use);
+ return MemNode && MemNode->getBasePtr().getNode() == N;
+ })) {
+ // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
+ // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
+ // it is possible that choosing op1 might be better.
+ SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
+ FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
+ TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
+ Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
+ Cmov.getOperand(2), Cmov.getOperand(3));
+ return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
+ }
+
+ // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
diff --git a/llvm/test/CodeGen/X86/add-cmov.ll b/llvm/test/CodeGen/X86/add-cmov.ll
index 8868124ab4c2..6dc49e497fd2 100644
--- a/llvm/test/CodeGen/X86/add-cmov.ll
+++ b/llvm/test/CodeGen/X86/add-cmov.ll
@@ -279,11 +279,11 @@ define void @bullet_load_store(i32 %x, i64 %y, %class.btAxis* %p) {
; CHECK: # %bb.0:
; CHECK-NEXT: leaq (%rsi,%rsi,4), %rax
; CHECK-NEXT: shlq $4, %rax
+; CHECK-NEXT: leaq 66(%rdx), %rcx
+; CHECK-NEXT: addq $60, %rdx
; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: leaq 60(%rdx,%rax), %rcx
-; CHECK-NEXT: leaq 66(%rdx,%rax), %rax
-; CHECK-NEXT: cmoveq %rcx, %rax
-; CHECK-NEXT: decw (%rax)
+; CHECK-NEXT: cmovneq %rcx, %rdx
+; CHECK-NEXT: decw (%rdx,%rax)
; CHECK-NEXT: retq
%and = and i32 %x, 1
%b = icmp eq i32 %and, 0
@@ -299,11 +299,11 @@ define void @bullet_load_store(i32 %x, i64 %y, %class.btAxis* %p) {
define void @complex_lea_alt1(i1 %b, i16* readnone %ptr, i64 %idx) {
; CHECK-LABEL: complex_lea_alt1:
; CHECK: # %bb.0:
-; CHECK-NEXT: leaq 60(%rdx,%rsi), %rax
-; CHECK-NEXT: leaq 66(%rdx,%rsi), %rcx
+; CHECK-NEXT: leaq 60(%rdx), %rax
+; CHECK-NEXT: addq $66, %rdx
; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: cmovneq %rax, %rcx
-; CHECK-NEXT: decw (%rcx)
+; CHECK-NEXT: cmovneq %rax, %rdx
+; CHECK-NEXT: decw (%rdx,%rsi)
; CHECK-NEXT: retq
%i = ptrtoint i16* %ptr to i64
%sum = add i64 %idx, %i
@@ -320,11 +320,11 @@ define void @complex_lea_alt1(i1 %b, i16* readnone %ptr, i64 %idx) {
define void @complex_lea_alt2(i1 %b, i16* readnone %ptr, i64 %idx) {
; CHECK-LABEL: complex_lea_alt2:
; CHECK: # %bb.0:
-; CHECK-NEXT: leaq 60(%rsi,%rdx), %rax
-; CHECK-NEXT: leaq 66(%rsi,%rdx), %rcx
+; CHECK-NEXT: leaq 60(%rsi), %rax
+; CHECK-NEXT: addq $66, %rsi
; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: cmovneq %rax, %rcx
-; CHECK-NEXT: decw (%rcx)
+; CHECK-NEXT: cmovneq %rax, %rsi
+; CHECK-NEXT: decw (%rsi,%rdx)
; CHECK-NEXT: retq
%i = ptrtoint i16* %ptr to i64
%sum = add i64 %i, %idx
@@ -433,11 +433,11 @@ define void @complex_lea_alt6(i1 %b, i16* readnone %ptr, i64 %idx) {
define void @complex_lea_alt7(i1 %b, i16* readnone %ptr, i64 %idx) {
; CHECK-LABEL: complex_lea_alt7:
; CHECK: # %bb.0:
-; CHECK-NEXT: leaq 60(%rdx,%rsi), %rax
-; CHECK-NEXT: leaq 66(%rdx,%rsi), %rcx
+; CHECK-NEXT: leaq 60(%rdx), %rax
+; CHECK-NEXT: addq $66, %rdx
; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: cmovneq %rax, %rcx
-; CHECK-NEXT: decw (%rcx)
+; CHECK-NEXT: cmovneq %rax, %rdx
+; CHECK-NEXT: decw (%rdx,%rsi)
; CHECK-NEXT: retq
%i = ptrtoint i16* %ptr to i64
%o = add i64 %idx, %i
@@ -455,11 +455,11 @@ define void @complex_lea_alt7(i1 %b, i16* readnone %ptr, i64 %idx) {
define void @complex_lea_alt8(i1 %b, i16* readnone %ptr, i64 %idx) {
; CHECK-LABEL: complex_lea_alt8:
; CHECK: # %bb.0:
-; CHECK-NEXT: leaq 60(%rsi,%rdx), %rax
-; CHECK-NEXT: leaq 66(%rsi,%rdx), %rcx
+; CHECK-NEXT: leaq 60(%rsi), %rax
+; CHECK-NEXT: addq $66, %rsi
; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: cmovneq %rax, %rcx
-; CHECK-NEXT: decw (%rcx)
+; CHECK-NEXT: cmovneq %rax, %rsi
+; CHECK-NEXT: decw (%rsi,%rdx)
; CHECK-NEXT: retq
%i = ptrtoint i16* %ptr to i64
%o = add i64 %i, %idx
More information about the llvm-commits
mailing list