[llvm] 985b48f - [DAGCombiner] check uses more strictly on select-of-binop fold

Wed Aug 25 11:15:16 PDT 2021

Author: Sanjay Patel
Date: 2021-08-25T14:14:41-04:00
New Revision: 985b48f18341082eba135741494f14eef3d9399e

URL: https://github.com/llvm/llvm-project/commit/985b48f18341082eba135741494f14eef3d9399e
DIFF: https://github.com/llvm/llvm-project/commit/985b48f18341082eba135741494f14eef3d9399e.diff

LOG: [DAGCombiner] check uses more strictly on select-of-binop fold

There are 2 bugs here:
1. We were not checking uses of operand 2 (the false value of the select).
2. We were not checking for multiple uses of nodes that produce >1 result.

Correcting those is enough to avoid the crash in the reduced test based on:
https://llvm.org/PR51612

The additional use check on operand 0 (the condition value of the select)
should not strictly be necessary because we are only replacing one use
with another (whether it makes performance sense to do the transform with
that pattern is not clear). But as noted in the TODO, changing that
uncovers another bug.

Note: there's at least one more bug here - we aren't propagating EVTs
correctly, but I plan to fix that in another patch.

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/AMDGPU/idiv-licm.ll
    llvm/test/CodeGen/X86/select.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d86d06ab6827..6def3258228b 100644

--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22565,7 +22565,11 @@ SDValue DAGCombiner::foldSelectOfBinops(SDNode *N) {
   if (!TLI.isBinOp(BinOpc) || (N2.getOpcode() != BinOpc))
     return SDValue();
 
-  if (!N->isOnlyUserOf(N0.getNode()) || !N->isOnlyUserOf(N1.getNode()))
+  // The use checks are intentionally on SDNode because we may be dealing
+  // with opcodes that produce more than one SDValue.
+  // TODO: Do we really need to check N0 (the condition operand of the select)?
+  //       But removing that clause could cause an infinite loop...
+  if (!N0->hasOneUse() || !N1->hasOneUse() || !N2->hasOneUse())
     return SDValue();
 
   // Fold select(cond, binop(x, y), binop(z, y))

diff  --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 5d168ed582cc..d950e69adf79 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -128,10 +128,10 @@ define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s5, v2
 ; GFX9-NEXT:    v_not_b32_e32 v2, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s4, v2
-; GFX9-NEXT:    v_add_u32_e32 v4, s2, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_add_u32_e32 v3, s2, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
 ; GFX9-NEXT:    v_add_u32_e32 v2, s2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX9-NEXT:    s_add_u32 s2, s2, 1
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
@@ -165,15 +165,15 @@ define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s3, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, v3, v2
-; GFX10-NEXT:    v_mul_lo_u32 v3, s5, v2
-; GFX10-NEXT:    v_not_b32_e32 v2, v2
-; GFX10-NEXT:    v_mul_lo_u32 v2, s4, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, s2, v3
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX10-NEXT:    v_not_b32_e32 v3, v2
+; GFX10-NEXT:    v_mul_lo_u32 v2, s5, v2
+; GFX10-NEXT:    v_mul_lo_u32 v3, s4, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, s2, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, s2, v3
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v2
 ; GFX10-NEXT:    s_add_u32 s2, s2, 1
 ; GFX10-NEXT:    s_addc_u32 s3, s3, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s4, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo

diff  --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll
index 8f4ba6b883a9..7a9219c8c5fc 100644
--- a/llvm/test/CodeGen/X86/select.ll
+++ b/llvm/test/CodeGen/X86/select.ll
@@ -1544,3 +1544,48 @@ entry:
  %1 = select i1 %cmp10, i32 %A, i32 %0
  ret i32 %1
 }
+
+define i64 @PR51612(i64 %x, i64 %y) {
+; CHECK-LABEL: PR51612:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    incl %esi
+; CHECK-NEXT:    incq %rax
+; CHECK-NEXT:    cmovel %esi, %eax
+; CHECK-NEXT:    andl 10, %eax
+; CHECK-NEXT:    retq
+;
+; ATHLON-LABEL: PR51612:
+; ATHLON:       ## %bb.0:
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; ATHLON-NEXT:    incl %edx
+; ATHLON-NEXT:    addl $1, %eax
+; ATHLON-NEXT:    adcl $0, %ecx
+; ATHLON-NEXT:    cmovbl %edx, %eax
+; ATHLON-NEXT:    andl 10, %eax
+; ATHLON-NEXT:    xorl %edx, %edx
+; ATHLON-NEXT:    retl
+;
+; MCU-LABEL: PR51612:
+; MCU:       # %bb.0:
+; MCU-NEXT:    addl $1, %eax
+; MCU-NEXT:    adcl $0, %edx
+; MCU-NEXT:    jae .LBB31_2
+; MCU-NEXT:  # %bb.1:
+; MCU-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MCU-NEXT:    incl %eax
+; MCU-NEXT:  .LBB31_2:
+; MCU-NEXT:    andl 10, %eax
+; MCU-NEXT:    xorl %edx, %edx
+; MCU-NEXT:    retl
+  %add = add i64 %x, 1
+  %inc = add i64 %y, 1
+  %tobool = icmp eq i64 %add, 0
+  %sel = select i1 %tobool, i64 %inc, i64 %add
+  %i = load i32, i32* inttoptr (i32 10 to i32*), align 4
+  %conv = zext i32 %i to i64
+  %and = and i64 %sel, %conv
+  ret i64 %and
+}