[llvm] r342886 - [DAGCombiner] use UADDO to optimize saturated unsigned add

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 24 07:47:15 PDT 2018


Author: spatel
Date: Mon Sep 24 07:47:15 2018
New Revision: 342886

URL: http://llvm.org/viewvc/llvm-project?rev=342886&view=rev
Log:
[DAGCombiner] use UADDO to optimize saturated unsigned add

This is a preliminary step towards solving PR14613:
https://bugs.llvm.org/show_bug.cgi?id=14613

If we have an 'add' instruction that sets flags, we can use it to eliminate an
explicit compare instruction (or some other flag-setting instruction such as 'cmn')
whose flags feed the later select.

As shown in the unchanged tests that use 'icmp ugt %x, %a', we're effectively 
reversing an IR icmp canonicalization that replaces a variable operand with a
constant:
https://rise4fun.com/Alive/V1Q
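
As a concrete i32 instance of that canonicalization (a sketch using the same
constants as the sat-add.ll tests below):

    %a = add i32 %x, 42
    %c = icmp ugt i32 %x, %a     ; compare x against the sum
      =>
    %c = icmp ugt i32 %x, -43    ; variable operand replaced by ~42 == -43

The two forms are equivalent because the unsigned sum wraps exactly when %x is
greater than ~42.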

But we're not using 'uaddo' in those cases via DAG transforms; that happens in
CGP after D8889, which does not check target lowering to see if the op is
supported. So AArch64 already shows 'uaddo' codegen for the i8/i16/i32/i64 test
variants with "using_cmp_sum" in the title. That's the pattern that CGP matches
as an unsigned saturated add and converts to uaddo without checking target
capabilities.
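
The "using_cmp_sum" shape that CGP matches looks like this (the i32 variant
from the tests):

    %a = add i32 %x, 42
    %c = icmp ugt i32 %x, %a            ; compare against the sum itself
    %r = select i1 %c, i32 -1, i32 %a

CGP rewrites that to a call to llvm.uadd.with.overflow and a select on the
overflow bit, regardless of target support.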

This patch is gated by isOperationLegalOrCustom(ISD::UADDO, VT), so we only see
AArch64 diffs for i32/i64 in the tests with "using_cmp_notval" in the title
(unlike x86, which sees improvements for all sizes because all sizes are
'custom' there). But the AArch64 code (like x86) looks better when translated
to 'uaddo' in all cases, so someone who works on AArch64 may want to set i8/i16
to 'custom' for UADDO so that this patch fires on those tests too.
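
For contrast, the "using_cmp_notval" shape that this patch handles is the
post-canonicalization form (again the i32 variant from the tests):

    %a = add i32 %x, 42
    %c = icmp ugt i32 %x, -43           ; -43 == ~42
    %r = select i1 %c, i32 -1, i32 %a

The new combine turns this into a UADDO node and a select on its overflow
result, as shown in the DAGCombiner diff below.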

Another possibility given the existing behavior: we could remove the
legal-or-custom check altogether because we're already assuming that a UADDO
sequence is canonical/optimal before we ever reach here. But that seems like a
bug to me: if the target doesn't have an add-with-flags op, then it's not
likely that we'll get optimal DAG combining using a UADDO node. This is similar
to the justification for why we don't canonicalize IR to UADDO's intrinsic
sibling (llvm.uadd.with.overflow) in the first place.

Differential Revision: https://reviews.llvm.org/D51929

Modified:
    llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/trunk/test/CodeGen/AArch64/sat-add.ll
    llvm/trunk/test/CodeGen/X86/sat-add.ll

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=342886&r1=342885&r2=342886&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Mon Sep 24 07:47:15 2018
@@ -7346,6 +7346,35 @@ SDValue DAGCombiner::visitSELECT(SDNode
                                                 CC, TLI, DAG))
         return FMinMax;
 
+    // Use 'unsigned add with overflow' to optimize an unsigned saturating add.
+    // This is conservatively limited to pre-legal-operations to give targets
+    // a chance to reverse the transform if they want to do that. Also, it is
+    // unlikely that the pattern would be formed late, so it's probably not
+    // worth going through the other checks.
+    if (!LegalOperations && TLI.isOperationLegalOrCustom(ISD::UADDO, VT) &&
+        CC == ISD::SETUGT && N0.hasOneUse() && isAllOnesConstant(N1) &&
+        N2.getOpcode() == ISD::ADD && Cond0 == N2.getOperand(0)) {
+      auto *C = dyn_cast<ConstantSDNode>(N2.getOperand(1));
+      auto *NotC = dyn_cast<ConstantSDNode>(Cond1);
+      if (C && NotC && C->getAPIntValue() == ~NotC->getAPIntValue()) {
+        // select (setcc Cond0, ~C, ugt), -1, (add Cond0, C) -->
+        // uaddo Cond0, C; select uaddo.1, -1, uaddo.0
+        //
+        // The IR equivalent of this transform would have this form:
+        //   %a = add %x, C
+        //   %c = icmp ugt %x, ~C
+        //   %r = select %c, -1, %a
+        //   =>
+        //   %u = call {iN,i1} llvm.uadd.with.overflow(%x, C)
+        //   %u0 = extractvalue %u, 0
+        //   %u1 = extractvalue %u, 1
+        //   %r = select %u1, -1, %u0
+        SDVTList VTs = DAG.getVTList(VT, VT0);
+        SDValue UAO = DAG.getNode(ISD::UADDO, DL, VTs, Cond0, N2.getOperand(1));
+        return DAG.getSelect(DL, VT, UAO.getValue(1), N1, UAO.getValue(0));
+      }
+    }
+
     if (TLI.isOperationLegal(ISD::SELECT_CC, VT) ||
         (!LegalOperations && TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)))
       return DAG.getNode(ISD::SELECT_CC, DL, VT, Cond0, Cond1, N1, N2,

Modified: llvm/trunk/test/CodeGen/AArch64/sat-add.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/sat-add.ll?rev=342886&r1=342885&r2=342886&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/sat-add.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/sat-add.ll Mon Sep 24 07:47:15 2018
@@ -123,9 +123,8 @@ define i32 @unsigned_sat_constant_i32_us
 define i32 @unsigned_sat_constant_i32_using_cmp_notval(i32 %x) {
 ; CHECK-LABEL: unsigned_sat_constant_i32_using_cmp_notval:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add w8, w0, #42 // =42
-; CHECK-NEXT:    cmn w0, #43 // =43
-; CHECK-NEXT:    csinv w0, w8, wzr, ls
+; CHECK-NEXT:    adds w8, w0, #42 // =42
+; CHECK-NEXT:    csinv w0, w8, wzr, lo
 ; CHECK-NEXT:    ret
   %a = add i32 %x, 42
   %c = icmp ugt i32 %x, -43
@@ -162,9 +161,8 @@ define i64 @unsigned_sat_constant_i64_us
 define i64 @unsigned_sat_constant_i64_using_cmp_notval(i64 %x) {
 ; CHECK-LABEL: unsigned_sat_constant_i64_using_cmp_notval:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, #42 // =42
-; CHECK-NEXT:    cmn x0, #43 // =43
-; CHECK-NEXT:    csinv x0, x8, xzr, ls
+; CHECK-NEXT:    adds x8, x0, #42 // =42
+; CHECK-NEXT:    csinv x0, x8, xzr, lo
 ; CHECK-NEXT:    ret
   %a = add i64 %x, 42
   %c = icmp ugt i64 %x, -43

Modified: llvm/trunk/test/CodeGen/X86/sat-add.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sat-add.ll?rev=342886&r1=342885&r2=342886&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sat-add.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sat-add.ll Mon Sep 24 07:47:15 2018
@@ -44,11 +44,10 @@ define i8 @unsigned_sat_constant_i8_usin
 define i8 @unsigned_sat_constant_i8_using_cmp_notval(i8 %x) {
 ; ANY-LABEL: unsigned_sat_constant_i8_using_cmp_notval:
 ; ANY:       # %bb.0:
-; ANY-NEXT:    cmpb $-43, %dil
+; ANY-NEXT:    addb $42, %dil
 ; ANY-NEXT:    movb $-1, %al
-; ANY-NEXT:    ja .LBB2_2
+; ANY-NEXT:    jb .LBB2_2
 ; ANY-NEXT:  # %bb.1:
-; ANY-NEXT:    addb $42, %dil
 ; ANY-NEXT:    movl %edi, %eax
 ; ANY-NEXT:  .LBB2_2:
 ; ANY-NEXT:    retq
@@ -91,12 +90,9 @@ define i16 @unsigned_sat_constant_i16_us
 define i16 @unsigned_sat_constant_i16_using_cmp_notval(i16 %x) {
 ; ANY-LABEL: unsigned_sat_constant_i16_using_cmp_notval:
 ; ANY:       # %bb.0:
-; ANY-NEXT:    # kill: def $edi killed $edi def $rdi
-; ANY-NEXT:    leal 42(%rdi), %ecx
-; ANY-NEXT:    movzwl %di, %eax
-; ANY-NEXT:    cmpl $65493, %eax # imm = 0xFFD5
+; ANY-NEXT:    addw $42, %di
 ; ANY-NEXT:    movl $65535, %eax # imm = 0xFFFF
-; ANY-NEXT:    cmovbel %ecx, %eax
+; ANY-NEXT:    cmovael %edi, %eax
 ; ANY-NEXT:    # kill: def $ax killed $ax killed $eax
 ; ANY-NEXT:    retq
   %a = add i16 %x, 42
@@ -135,11 +131,9 @@ define i32 @unsigned_sat_constant_i32_us
 define i32 @unsigned_sat_constant_i32_using_cmp_notval(i32 %x) {
 ; ANY-LABEL: unsigned_sat_constant_i32_using_cmp_notval:
 ; ANY:       # %bb.0:
-; ANY-NEXT:    # kill: def $edi killed $edi def $rdi
-; ANY-NEXT:    leal 42(%rdi), %ecx
-; ANY-NEXT:    cmpl $-43, %edi
+; ANY-NEXT:    addl $42, %edi
 ; ANY-NEXT:    movl $-1, %eax
-; ANY-NEXT:    cmovbel %ecx, %eax
+; ANY-NEXT:    cmovael %edi, %eax
 ; ANY-NEXT:    retq
   %a = add i32 %x, 42
   %c = icmp ugt i32 %x, -43
@@ -177,10 +171,9 @@ define i64 @unsigned_sat_constant_i64_us
 define i64 @unsigned_sat_constant_i64_using_cmp_notval(i64 %x) {
 ; ANY-LABEL: unsigned_sat_constant_i64_using_cmp_notval:
 ; ANY:       # %bb.0:
-; ANY-NEXT:    cmpq $-43, %rdi
-; ANY-NEXT:    leaq 42(%rdi), %rax
-; ANY-NEXT:    movq $-1, %rcx
-; ANY-NEXT:    cmovaq %rcx, %rax
+; ANY-NEXT:    addq $42, %rdi
+; ANY-NEXT:    movq $-1, %rax
+; ANY-NEXT:    cmovaeq %rdi, %rax
 ; ANY-NEXT:    retq
   %a = add i64 %x, 42
   %c = icmp ugt i64 %x, -43



