[llvm] a2963d8 - [SDAG] fold sub-of-shift to add-of-shift
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 18 08:56:00 PST 2022
Author: Sanjay Patel
Date: 2022-02-18T11:55:50-05:00
New Revision: a2963d871ee5c2786de409b42f67ffcc39e53184
URL: https://github.com/llvm/llvm-project/commit/a2963d871ee5c2786de409b42f67ffcc39e53184
DIFF: https://github.com/llvm/llvm-project/commit/a2963d871ee5c2786de409b42f67ffcc39e53184.diff
LOG: [SDAG] fold sub-of-shift to add-of-shift
This fold is already done in IR; Alive2 proof:
https://alive2.llvm.org/ce/z/jWyFrP
There is an x86 test that shows an improvement
from the added flexibility of using add (commutative).
The other diffs are presumed neutral.
Note that this could also be folded to an 'xor',
but it is not clear that would be universally better
(e.g., x86 can convert adds more easily into LEA).
This helps prevent regressions from a potential fold for
issue #53829.
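
For illustration (not part of the commit): the fold relies on the fact
that (X << BW-1) is always either 0 or SMIN, and in two's complement
-SMIN == SMIN, so subtracting that value is the same as adding it. A
minimal standalone C++ check, exhaustive for 8-bit values using
unsigned (wrapping) arithmetic:

// Sketch only: verifies N0 - (X << BW-1) == N0 + (X << BW-1) for BW=8.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned N = 0; N < 256; ++N) {
    for (unsigned X = 0; X < 256; ++X) {
      uint8_t Shl = static_cast<uint8_t>(X << 7); // always 0 or 0x80 (SMIN)
      uint8_t Sub = static_cast<uint8_t>(N - Shl);
      uint8_t Add = static_cast<uint8_t>(N + Shl);
      assert(Sub == Add && "sub-of-shift must equal add-of-shift");
    }
  }
  return 0;
}

Since add is commutative and sub is not, the canonicalized form gives
the DAG combiner and instruction selection more matching freedom, which
is what produces the improved x86 sequence in imul.ll below.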
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
llvm/test/CodeGen/X86/combine-srem.ll
llvm/test/CodeGen/X86/imul.ll
llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a0708336d26a2..89c3e41392882 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3656,6 +3656,15 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
}
}
+ // As with the previous fold, prefer add for more folding potential.
+ // Subtracting SMIN/0 is the same as adding SMIN/0:
+ // N0 - (X << BW-1) --> N0 + (X << BW-1)
+ if (N1.getOpcode() == ISD::SHL) {
+ ConstantSDNode *ShlC = isConstOrConstSplat(N1.getOperand(1));
+ if (ShlC && ShlC->getAPIntValue() == VT.getScalarSizeInBits() - 1)
+ return DAG.getNode(ISD::ADD, DL, VT, N1, N0);
+ }
+
if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) {
// (sub Carry, X) -> (addcarry (sub 0, X), 0, Carry)
if (SDValue Carry = getAsCarry(TLI, N0)) {
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
index fc033bc741c19..c6f7377794413 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
@@ -208,7 +208,7 @@ define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind {
; CHECK-NEXT: movi v3.4s, #128, lsl #24
; CHECK-NEXT: usra v1.4s, v2.4s, #1
; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
diff --git a/llvm/test/CodeGen/X86/combine-srem.ll b/llvm/test/CodeGen/X86/combine-srem.ll
index be2b7b86c8aed..575f37117f2da 100644
--- a/llvm/test/CodeGen/X86/combine-srem.ll
+++ b/llvm/test/CodeGen/X86/combine-srem.ll
@@ -74,7 +74,7 @@ define <4 x i32> @combine_vec_srem_by_minsigned(<4 x i32> %x) {
; SSE-NEXT: psrld $1, %xmm1
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: psubd %xmm1, %xmm0
+; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_srem_by_minsigned:
@@ -83,7 +83,7 @@ define <4 x i32> @combine_vec_srem_by_minsigned(<4 x i32> %x) {
; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_srem_by_minsigned:
@@ -93,7 +93,7 @@ define <4 x i32> @combine_vec_srem_by_minsigned(<4 x i32> %x) {
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
%1 = srem <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
ret <4 x i32> %1
diff --git a/llvm/test/CodeGen/X86/imul.ll b/llvm/test/CodeGen/X86/imul.ll
index 4a4b159c68ef7..9131688c4efcc 100644
--- a/llvm/test/CodeGen/X86/imul.ll
+++ b/llvm/test/CodeGen/X86/imul.ll
@@ -529,9 +529,8 @@ define i64 @testNegOverflow(i64 %a) {
; X64-LABEL: testNegOverflow:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, %rcx
-; X64-NEXT: shlq $63, %rcx
-; X64-NEXT: subq %rcx, %rax
+; X64-NEXT: shlq $63, %rax
+; X64-NEXT: addq %rdi, %rax
; X64-NEXT: retq
;
; X86-LABEL: testNegOverflow:
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
index 4aa45ecde8736..95eb23fc3cd5d 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
@@ -622,7 +622,7 @@ define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind {
; CHECK-SSE-NEXT: psrld $1, %xmm1
; CHECK-SSE-NEXT: paddd %xmm0, %xmm1
; CHECK-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE-NEXT: paddd %xmm1, %xmm0
; CHECK-SSE-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE-NEXT: psrld $31, %xmm0
@@ -634,7 +634,7 @@ define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind {
; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
@@ -647,7 +647,7 @@ define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind {
; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; CHECK-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
@@ -659,7 +659,7 @@ define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind {
; CHECK-AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0